diff --git a/CMakeLists.txt b/CMakeLists.txt index 4f6ed9de30efe4..83191254f1a229 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -130,7 +130,7 @@ if(WIN32) # NOTE(zhouwei25): GPU compile have too high memory utilization when parallel compiling, # For Visual Studio generators, /MP should be added. # For other generators like Ninja, it is not need to add /MP. - if("${CMAKE_GENERATOR}" STREQUAL "Visual Studio" AND NOT WITH_GPU) + if(CMAKE_GENERATOR MATCHES "Visual Studio" AND NOT WITH_GPU) math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3") set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}") endif() diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 8360761de6fb98..0f9739014d52bf 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -41,6 +41,7 @@ ExternalProject_Add( ${SHALLOW_CLONE} "${GFLAGS_DOWNLOAD_CMD}" PREFIX ${GFLAGS_PREFIX_DIR} + UPDATE_COMMAND "" SOURCE_DIR ${GFLAGS_SOURCE_DIR} BUILD_COMMAND ${BUILD_COMMAND} INSTALL_COMMAND ${INSTALL_COMMAND} diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index d2bb1e62e83de3..b9dbe90a92e6f4 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -45,6 +45,7 @@ ExternalProject_Add( DEPENDS gflags PREFIX ${GLOG_PREFIX_DIR} SOURCE_DIR ${GLOG_SOURCE_DIR} + UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS} diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 9963237ff188cf..0a3b64e5d56821 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -79,22 +79,10 @@ ExternalProject_Add( -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG} -DDNNL_BUILD_TESTS=OFF -DDNNL_BUILD_EXAMPLES=OFF CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR} - BUILD_BYPRODUCTS ${MKLDNN_LIB} ) -ADD_LIBRARY(shared_mkldnn SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) -ADD_DEPENDENCIES(shared_mkldnn ${MKLDNN_PROJECT}) MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}") add_definitions(-DPADDLE_WITH_MKLDNN) - -# generate a static dummy target to track mkldnn dependencies -# for cc_library(xxx SRCS xxx.c DEPS mkldnn) -generate_dummy_static_lib(LIB_NAME "mkldnn" GENERATOR "mkldnn.cmake") - -TARGET_LINK_LIBRARIES(mkldnn ${MKLDNN_LIB} ${MKLML_IOMP_LIB}) -ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) - # copy the real so.0 lib to install dir # it can be directly contained in wheel or capi if(WIN32) @@ -102,26 +90,33 @@ if(WIN32) file(TO_NATIVE_PATH ${MKLDNN_INSTALL_DIR} NATIVE_MKLDNN_INSTALL_DIR) file(TO_NATIVE_PATH ${MKLDNN_SHARED_LIB} NATIVE_MKLDNN_SHARED_LIB) - ADD_CUSTOM_COMMAND(TARGET ${MKLDNN_PROJECT} POST_BUILD - COMMAND (copy ${NATIVE_MKLDNN_INSTALL_DIR}\\bin\\dnnl.dll ${NATIVE_MKLDNN_SHARED_LIB} /Y)) - add_custom_command(TARGET ${MKLDNN_PROJECT} POST_BUILD VERBATIM - COMMAND dumpbin /exports ${MKLDNN_INSTALL_DIR}/bin/mkldnn.dll > ${MKLDNN_INSTALL_DIR}/bin/exports.txt) - add_custom_command(TARGET ${MKLDNN_PROJECT} POST_BUILD VERBATIM - COMMAND echo LIBRARY mkldnn > ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def) - add_custom_command(TARGET ${MKLDNN_PROJECT} POST_BUILD VERBATIM - COMMAND echo EXPORTS >> ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def) - add_custom_command(TARGET ${MKLDNN_PROJECT} POST_BUILD VERBATIM - COMMAND echo off && (for /f "skip=19 tokens=4" %A in (${MKLDNN_INSTALL_DIR}/bin/exports.txt) do echo %A >> ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def) && echo on) - add_custom_command(TARGET ${MKLDNN_PROJECT} 
POST_BUILD VERBATIM - COMMAND lib /def:${MKLDNN_INSTALL_DIR}/bin/mkldnn.def /out:${MKLDNN_INSTALL_DIR}/bin/mkldnn.lib /machine:x64) + + ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_LIB} + COMMAND (copy ${NATIVE_MKLDNN_INSTALL_DIR}\\bin\\dnnl.dll ${NATIVE_MKLDNN_SHARED_LIB} /Y) + COMMAND dumpbin /exports ${MKLDNN_INSTALL_DIR}/bin/mkldnn.dll > ${MKLDNN_INSTALL_DIR}/bin/exports.txt + COMMAND echo LIBRARY mkldnn > ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def + COMMAND echo EXPORTS >> ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def + COMMAND echo off && (for /f "skip=19 tokens=4" %A in (${MKLDNN_INSTALL_DIR}/bin/exports.txt) do echo %A >> ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def) && echo on + COMMAND lib /def:${MKLDNN_INSTALL_DIR}/bin/mkldnn.def /out:${MKLDNN_LIB} /machine:x64 + COMMENT "Generate mkldnn.lib manually--->" + DEPENDS ${MKLDNN_PROJECT} + VERBATIM) + ADD_CUSTOM_TARGET(mkldnn_cmd ALL DEPENDS ${MKLDNN_LIB}) else(WIN32) SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0) SET(MKLDNN_SHARED_LIB_1 ${MKLDNN_INSTALL_DIR}/libdnnl.so.1) SET(MKLDNN_SHARED_LIB_2 ${MKLDNN_INSTALL_DIR}/libdnnl.so.2) - ADD_CUSTOM_COMMAND(TARGET ${MKLDNN_PROJECT} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB}) - ADD_CUSTOM_COMMAND(TARGET ${MKLDNN_PROJECT} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB_1}) - ADD_CUSTOM_COMMAND(TARGET ${MKLDNN_PROJECT} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB_2}) + ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB_2} + COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB} + COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB_1} + COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB_2} + DEPENDS ${MKLDNN_PROJECT}) + ADD_CUSTOM_TARGET(mkldnn_cmd ALL DEPENDS ${MKLDNN_SHARED_LIB_2}) endif(WIN32) + +# generate a static dummy target to track mkldnn dependencies +# for cc_library(xxx SRCS xxx.c DEPS mkldnn) +generate_dummy_static_lib(LIB_NAME "mkldnn" GENERATOR "mkldnn.cmake") + +TARGET_LINK_LIBRARIES(mkldnn ${MKLDNN_LIB} ${MKLML_IOMP_LIB}) +ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT} mkldnn_cmd) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index a2b6ddadb625f6..8a9bc6e42c1464 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -198,16 +198,16 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) "-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}") ENDIF() -if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) - SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) - SET(PROTOBUF_TAG v3.8.0) -elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) - SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) - SET(PROTOBUF_TAG v3.8.0) -else() - SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) - SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546) -endif() + if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) + SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) + SET(PROTOBUF_TAG v3.8.0) + elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) + SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) + SET(PROTOBUF_TAG v3.8.0) + else() + SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) + SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546) + endif() cache_third_party(${TARGET_NAME} REPOSITORY ${PROTOBUF_REPOSITORY} diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake index 69bd68c2778497..353cb5c72fdfb9 100644 --- 
a/cmake/external/pybind11.cmake +++ b/cmake/external/pybind11.cmake @@ -39,6 +39,7 @@ ExternalProject_Add( # to be modified without triggering incremental compilation, and the # third-party library version changes cannot be incorporated. # reference: https://cmake.org/cmake/help/latest/module/ExternalProject.html + UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 640e2e37ad434d..aa41173c81a22a 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -35,7 +35,7 @@ ELSE () ENDIF() SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") -SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210729") +SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210804") SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 3dcf0b74f7940f..669875d81dfecc 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -205,6 +205,9 @@ copy(inference_lib_dist copy(inference_lib_dist SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/float16.h DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/) +copy(inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/any.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/) # CAPI inference library for only inference set(PADDLE_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_c_install_dir" CACHE STRING diff --git a/paddle/fluid/distributed/common/sparse_sharding_merge.h b/paddle/fluid/distributed/common/sparse_sharding_merge.h index 3f84b5c4b212e2..3bd36e65ba8521 100644 --- a/paddle/fluid/distributed/common/sparse_sharding_merge.h +++ b/paddle/fluid/distributed/common/sparse_sharding_merge.h @@ -21,7 +21,6 @@ #include #include -#include "boost/lexical_cast.hpp" #include "glog/logging.h" #include "paddle/fluid/distributed/common/utils.h" #include "paddle/fluid/framework/blocking_queue.h" @@ -36,8 +35,6 @@ constexpr int Q_SIZE = 10000; constexpr int BUCKET = 10; constexpr char XEOF[] = "EOF"; -using boost::lexical_cast; - inline double GetCurrentUS() { struct timeval time; gettimeofday(&time, NULL); @@ -208,8 +205,10 @@ class ShardingMerge { for (int x = 0; x < embedding_dim; ++x) { float v = 0.0; try { - v = lexical_cast(values_str[x]); - } catch (boost::bad_lexical_cast &e) { + v = std::stof(values_str[x]); + } catch (std::invalid_argument &e) { + VLOG(0) << " get unexpected line: " << line; + } catch (std::out_of_range &e) { VLOG(0) << " get unexpected line: " << line; } out->push_back(v); diff --git a/paddle/fluid/distributed/index_dataset/index_wrapper.cc b/paddle/fluid/distributed/index_dataset/index_wrapper.cc index 99fe4ca0c6d043..7a9691f3602e26 100644 --- a/paddle/fluid/distributed/index_dataset/index_wrapper.cc +++ b/paddle/fluid/distributed/index_dataset/index_wrapper.cc @@ -17,8 +17,6 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/io/fs.h" -#include -#include #include "paddle/fluid/distributed/index_dataset/index_wrapper.h" namespace paddle { @@ -65,7 +63,7 @@ int TreeIndex::Load(const std::string filename) { if (item.key() == ".tree_meta") { meta_.ParseFromString(item.value()); } else { - auto code = boost::lexical_cast(item.key()); + auto code = std::stoull(item.key()); IndexNode node; node.ParseFromString(item.value()); PADDLE_ENFORCE_NE(node.id(), 0, diff --git a/paddle/fluid/distributed/table/common_sparse_table.cc b/paddle/fluid/distributed/table/common_sparse_table.cc index e1223face0f54a..8b79b1c02fce5e 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.cc +++ b/paddle/fluid/distributed/table/common_sparse_table.cc @@ -15,7 +15,6 @@ #include "paddle/fluid/distributed/table/common_sparse_table.h" #include -#include "boost/lexical_cast.hpp" #include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" @@ -50,8 +49,11 @@ void CommonSparseTable::ProcessALine(const std::vector& columns, float v = 0.0; try { - v = lexical_cast(va); - } catch (boost::bad_lexical_cast& e) { + v = std::stof(va); + } catch (std::invalid_argument& e) { + VLOG(0) << "id: " << id << " get unexpected value: " << va + << " and be reset to: 0.0"; + } catch (std::out_of_range& e) { VLOG(0) << "id: " << id << " get unexpected value: " << va << " and be reset to: 0.0"; } @@ -131,7 +133,7 @@ int64_t CommonSparseTable::LoadFromText( while (std::getline(file, line)) { auto values = paddle::string::split_string(line, "\t"); - auto id = lexical_cast(values[0]); + auto id = std::stoull(values[0]); if (id % pserver_num != pserver_id) { VLOG(3) << "will not load " << values[0] << " from " << valuepath @@ -150,10 +152,9 @@ int64_t CommonSparseTable::LoadFromText( VALUE* value_instant = block->GetValue(id); if (values.size() == 5) { - value_instant->count_ = lexical_cast(values[1]); - value_instant->unseen_days_ = lexical_cast(values[2]); - value_instant->is_entry_ = - static_cast(lexical_cast(values[3])); + value_instant->count_ = std::stoi(values[1]); + value_instant->unseen_days_ = std::stoi(values[2]); + value_instant->is_entry_ = static_cast(std::stoi(values[3])); } std::vector block_values = block->Get(id, meta.names, meta.dims); diff --git a/paddle/fluid/distributed/table/common_sparse_table.h b/paddle/fluid/distributed/table/common_sparse_table.h index ce3cc11686a480..a443710bf0fd82 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.h +++ b/paddle/fluid/distributed/table/common_sparse_table.h @@ -33,7 +33,6 @@ #include "paddle/fluid/string/string_helper.h" #define PSERVER_SAVE_SUFFIX ".shard" -using boost::lexical_cast; namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/table/ssd_sparse_table.cc b/paddle/fluid/distributed/table/ssd_sparse_table.cc index 5de6de3d2909d6..41eca72cf80717 100644 --- a/paddle/fluid/distributed/table/ssd_sparse_table.cc +++ b/paddle/fluid/distributed/table/ssd_sparse_table.cc @@ -310,7 +310,7 @@ int64_t SSDSparseTable::LoadFromText( while (std::getline(file, line)) { auto values = paddle::string::split_string(line, "\t"); - auto id = lexical_cast(values[0]); + auto id = std::stoull(values[0]); if (id % pserver_num != pserver_id) { VLOG(3) << "will not load " << values[0] << " from " << valuepath @@ -329,10 +329,9 @@ int64_t SSDSparseTable::LoadFromText( VALUE* value_instant = block->GetValue(id); if (values.size() == 5) { - value_instant->count_ = lexical_cast(values[1]); - value_instant->unseen_days_ = 
lexical_cast(values[2]); - value_instant->is_entry_ = - static_cast(lexical_cast(values[3])); + value_instant->count_ = std::stoi(values[1]); + value_instant->unseen_days_ = std::stoi(values[2]); + value_instant->is_entry_ = static_cast(std::stoi(values[3])); } std::vector block_values = block->Get(id, meta.names, meta.dims); diff --git a/paddle/fluid/extension/include/ext_op_meta_info.h b/paddle/fluid/extension/include/ext_op_meta_info.h index c400164c7543da..6f2528030e603d 100644 --- a/paddle/fluid/extension/include/ext_op_meta_info.h +++ b/paddle/fluid/extension/include/ext_op_meta_info.h @@ -19,8 +19,7 @@ limitations under the License. */ #include #include -#include - +#include "any.h" #include "ext_dll_decl.h" // NOLINT #include "ext_exception.h" // NOLINT #include "ext_tensor.h" // NOLINT @@ -83,7 +82,7 @@ inline std::string Vec(const std::string& t_name) { using KernelFunc = std::vector (*)(const std::vector& inputs, const std::vector>& vec_inputs, - const std::vector& attrs); + const std::vector& attrs); #define PD_SPECIALIZE_ComputeCallHelper(attr_type) \ template \ @@ -92,14 +91,14 @@ using KernelFunc = typename... PreviousArgs> \ static Return Compute(const std::vector& inputs, \ const std::vector>& vec_inputs, \ - const std::vector& attrs, \ + const std::vector& attrs, \ const PreviousArgs&... pargs) { \ try { \ - attr_type arg = boost::any_cast(attrs[attr_idx]); \ + attr_type arg = paddle::any_cast(attrs[attr_idx]); \ return ComputeCallHelper::template Compute< \ in_idx, vec_in_idx, attr_idx + 1>(inputs, vec_inputs, attrs, \ pargs..., arg); \ - } catch (boost::bad_any_cast&) { \ + } catch (paddle::bad_any_cast&) { \ PD_THROW( \ "Attribute cast error in custom operator. Expected " #attr_type \ " value."); \ @@ -117,7 +116,7 @@ template struct KernelFuncImpl { static Return Compute(const std::vector& inputs, const std::vector>& vec_inputs, - const std::vector& attrs) { + const std::vector& attrs) { return ComputeCallHelper>::template Compute<0, 0, 0>( inputs, vec_inputs, attrs); } @@ -132,7 +131,7 @@ struct KernelFuncImpl { typename... PreviousArgs> static Return Compute(const std::vector& inputs, const std::vector>& vec_inputs, - const std::vector& attrs, + const std::vector& attrs, const PreviousArgs&... pargs) { const Tensor& arg = inputs[in_idx]; return ComputeCallHelper::template Compute { typename... PreviousArgs> static Return Compute(const std::vector& inputs, const std::vector>& vec_inputs, - const std::vector& attrs, + const std::vector& attrs, const PreviousArgs&... pargs) { const std::vector& arg = vec_inputs[vec_in_idx]; return ComputeCallHelper::template Compute< @@ -189,7 +188,7 @@ struct KernelFuncImpl { template static Return Compute(const std::vector& inputs, const std::vector>& vec_inputs, - const std::vector& attrs, + const std::vector& attrs, const Args&... args) { return impl_fn(args...); } @@ -205,67 +204,67 @@ struct KernelFuncImpl { using InferShapeFunc = std::vector> (*)( const std::vector>& input_shapes, const std::vector>>& vec_input_shapes, - const std::vector& attrs); - -#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPE(input_type) \ - template \ - struct InferShapeCallHelper { \ - template \ - static Return InferShape( \ - const std::vector>& input_shapes, \ - const std::vector>>& \ - vec_input_shapes, \ - const std::vector& attrs, const PreviousArgs&... 
pargs) { \ - input_type arg = input_shapes[in_idx]; \ - return InferShapeCallHelper::template InferShape< \ - in_idx + 1, vec_in_idx, attr_idx>(input_shapes, vec_input_shapes, \ - attrs, pargs..., arg); \ - } \ + const std::vector& attrs); + +#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPE(input_type) \ + template \ + struct InferShapeCallHelper { \ + template \ + static Return InferShape( \ + const std::vector>& input_shapes, \ + const std::vector>>& \ + vec_input_shapes, \ + const std::vector& attrs, const PreviousArgs&... pargs) { \ + input_type arg = input_shapes[in_idx]; \ + return InferShapeCallHelper::template InferShape< \ + in_idx + 1, vec_in_idx, attr_idx>(input_shapes, vec_input_shapes, \ + attrs, pargs..., arg); \ + } \ } -#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPES(input_type) \ - template \ - struct InferShapeCallHelper { \ - template \ - static Return InferShape( \ - const std::vector>& input_shapes, \ - const std::vector>>& \ - vec_input_shapes, \ - const std::vector& attrs, const PreviousArgs&... pargs) { \ - input_type arg = vec_input_shapes[vec_in_idx]; \ - return InferShapeCallHelper::template InferShape< \ - in_idx, vec_in_idx + 1, attr_idx>(input_shapes, vec_input_shapes, \ - attrs, pargs..., arg); \ - } \ +#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPES(input_type) \ + template \ + struct InferShapeCallHelper { \ + template \ + static Return InferShape( \ + const std::vector>& input_shapes, \ + const std::vector>>& \ + vec_input_shapes, \ + const std::vector& attrs, const PreviousArgs&... pargs) { \ + input_type arg = vec_input_shapes[vec_in_idx]; \ + return InferShapeCallHelper::template InferShape< \ + in_idx, vec_in_idx + 1, attr_idx>(input_shapes, vec_input_shapes, \ + attrs, pargs..., arg); \ + } \ } -#define PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(attr_type) \ - template \ - struct InferShapeCallHelper { \ - template \ - static Return InferShape( \ - const std::vector>& input_shapes, \ - const std::vector>>& \ - vec_input_shapes, \ - const std::vector& attrs, const PreviousArgs&... pargs) { \ - try { \ - attr_type arg = boost::any_cast(attrs[attr_idx]); \ - return InferShapeCallHelper::template InferShape< \ - in_idx, vec_in_idx, attr_idx + 1>(input_shapes, vec_input_shapes, \ - attrs, pargs..., arg); \ - } catch (boost::bad_any_cast&) { \ - PD_THROW( \ - "Attribute cast error in custom operator InferShapeFn. " \ - "Expected " #attr_type \ - " value. InferShapeFn's attribute list must be exactly same as " \ - "Forward " \ - "KernelFn's attribute list except std::vector " \ - "attribute."); \ - } \ - } \ +#define PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(attr_type) \ + template \ + struct InferShapeCallHelper { \ + template \ + static Return InferShape( \ + const std::vector>& input_shapes, \ + const std::vector>>& \ + vec_input_shapes, \ + const std::vector& attrs, const PreviousArgs&... pargs) { \ + try { \ + attr_type arg = paddle::any_cast(attrs[attr_idx]); \ + return InferShapeCallHelper::template InferShape< \ + in_idx, vec_in_idx, attr_idx + 1>(input_shapes, vec_input_shapes, \ + attrs, pargs..., arg); \ + } catch (paddle::bad_any_cast&) { \ + PD_THROW( \ + "Attribute cast error in custom operator InferShapeFn. " \ + "Expected " #attr_type \ + " value. 
InferShapeFn's attribute list must be exactly same as " \ + "Forward " \ + "KernelFn's attribute list except std::vector " \ + "attribute."); \ + } \ + } \ } template @@ -276,7 +275,7 @@ struct InferShapeFuncImpl { static Return InferShape( const std::vector>& input_shapes, const std::vector>>& vec_input_shapes, - const std::vector& attrs) { + const std::vector& attrs) { return InferShapeCallHelper>::template InferShape< 0, 0, 0>(input_shapes, vec_input_shapes, attrs); } @@ -314,7 +313,7 @@ struct InferShapeFuncImpl { static Return InferShape( const std::vector>& input_shapes, const std::vector>>& vec_input_shapes, - const std::vector& attrs, const Args&... args) { + const std::vector& attrs, const Args&... args) { return impl_fn(args...); } }; diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 08e912f52ccb57..419db670467a01 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -411,6 +411,7 @@ configure_file(commit.h.in commit.h) # to avoid exposing the path of the underlying file include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform) include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/extension/include) +include_directories(${PADDLE_SOURCE_DIR}/paddle/utils) if(WITH_ROCM) hip_library(custom_tensor SRCS ../extension/src/ext_tensor.cc DEPS lod_tensor memory enforce) @@ -427,6 +428,9 @@ else() cc_test(custom_tensor_test SRCS custom_tensor_test.cc DEPS custom_tensor glog) endif() +#cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ) +#cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) + set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator) cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES}) diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index b1c5ff86d19790..7fef165f373969 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -34,6 +34,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/utils/any.h" namespace paddle { namespace framework { @@ -149,7 +150,7 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx, } } - std::vector custom_attrs; + std::vector custom_attrs; for (auto& attr_str : attrs) { auto attr_name_and_type = detail::ParseAttrStr(attr_str); auto attr_name = attr_name_and_type[0]; @@ -605,7 +606,7 @@ void RegisterOperatorWithMetaInfo( } } - std::vector custom_attrs; + std::vector custom_attrs; for (auto& attr_str : op_attrs) { auto attr_name_and_type = detail::ParseAttrStr(attr_str); auto attr_name = attr_name_and_type[0]; diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 1546027b794bb5..bbb781c8664baf 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -141,7 +141,7 @@ if(NOT APPLE AND NOT WIN32 AND (WITH_GPU OR WITH_ROCM)) endif() cc_library(build_strategy SRCS build_strategy.cc DEPS pass_builder ${IR_PASS_DEPS}) cc_test(build_strategy_test SRCS build_strategy_test.cc - DEPS build_strategy op_registry op_proto_maker graph) + DEPS build_strategy op_registry op_proto_maker graph string_helper) if (WITH_MKLDNN) target_link_libraries(build_strategy mkldnn_placement_pass) diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index b28c884429c179..1de6d26d05b9e4 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -183,7 +183,7 @@ message DistributedStrategy { optional bool use_hierarchical_allreduce = 15 [ default = false ]; optional int32 hierarchical_allreduce_inter_nranks = 16 [ default = 1 ]; optional bool sync_batch_norm = 17 [ default = false ]; - optional bool fuse_all_reduce_ops = 18 [ default = false ]; + optional bool fuse_all_reduce_ops = 18 [ default = true ]; optional int32 fuse_grad_size_in_MB = 19 [ default = 32 ]; optional float fuse_grad_size_in_TFLOPS = 20 [ default = 50 ]; optional bool cudnn_exhaustive_search = 21 [ default = false ]; diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index 54a647a73cfebb..bb318e59e46e41 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -262,7 +262,7 @@ void FleetWrapper::HeterPushSparseVars( int64_t* ids = tensor->data(); int slot = 0; if (dump_slot) { - slot = boost::lexical_cast(sparse_key_names[i]); + slot = std::stoi(sparse_key_names[i]); } Variable* g_var = scope.FindVar(sparse_grad_names[i]); if (g_var == nullptr) { @@ -915,12 +915,17 @@ void FleetWrapper::PushSparseVarsWithLabelAsync( int slot = 0; if (dump_slot) { try { - slot = boost::lexical_cast(sparse_key_names[i]); - } catch (boost::bad_lexical_cast const& e) { + slot = std::stoi(sparse_key_names[i]); + } catch (std::invalid_argument const& e) { PADDLE_THROW(platform::errors::PreconditionNotMet( "sparse var's name: %s, doesn't support non-integer type name when " "dump_slot=True", sparse_key_names[i])); + } catch (std::out_of_range const& e) { + PADDLE_THROW(platform::errors::PreconditionNotMet( + "sparse var's name: %s, integer type name out of range when " + "dump_slot=True", + sparse_key_names[i])); } } Variable* g_var = scope.FindVar(sparse_grad_names[i]); @@ -1121,7 +1126,7 @@ void 
FleetWrapper::PushSparseFromTensorWithLabelAsync( data[click_index] = static_cast(fea_labels.at(input_idx)); } if (dump_slot) { - int slot = boost::lexical_cast(input_names[index]); + int slot = std::stoi(input_names[index]); data[0] = static_cast(slot); } ++input_idx; diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 0107f5976499ce..384f80395c7784 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -59,7 +59,7 @@ cc_library(coalesce_grad_tensor_pass SRCS coalesce_grad_tensor_pass.cc DEPS grap pass_library(graph_to_program_pass base) pass_library(graph_viz_pass base) -pass_library(lock_free_optimize_pass base) +pass_library(lock_free_optimize_pass base DEPS string_helper) pass_library(fc_fuse_pass inference) pass_library(map_matmul_to_mul_pass inference) pass_library(attention_lstm_fuse_pass inference) diff --git a/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.cc b/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.cc index 0e2bb3eaad536f..c280b7c32ed21d 100644 --- a/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.cc +++ b/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.cc @@ -60,6 +60,7 @@ AdaptivePool2dConvertGlobalPass::AdaptivePool2dConvertGlobalPass() { .IsStringIn({"NHWC", "NCHW"}) .End() .AddAttr("padding_algorithm") + .IsOptional() .IsStringIn({"EXPLICIT", "SAME", "VALID"}) .End(); } diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc index e4ac89f04ff679..3875d856d20bd6 100644 --- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc @@ -120,6 +120,7 @@ ConvAffineChannelFusePass::ConvAffineChannelFusePass() { .IsType>() .End() .AddAttr("padding_algorithm") + .IsOptional() .IsStringIn({"EXPLICIT", "SAME", "VALID"}) .End() .AddAttr("groups") @@ -129,7 +130,7 @@ ConvAffineChannelFusePass::ConvAffineChannelFusePass() { .IsType>() .End() .AddAttr("data_format") - .IsStringIn({"NCHW", "NHWC"}) + .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) .End(); AddOpCompat(OpCompat("affine_channel")) @@ -267,6 +268,7 @@ ConvEltwiseAddAffineChannelFusePass::ConvEltwiseAddAffineChannelFusePass() { .IsType>() .End() .AddAttr("padding_algorithm") + .IsOptional() .IsStringIn({"EXPLICIT", "SAME", "VALID"}) .End() .AddAttr("groups") @@ -276,7 +278,7 @@ ConvEltwiseAddAffineChannelFusePass::ConvEltwiseAddAffineChannelFusePass() { .IsType>() .End() .AddAttr("data_format") - .IsStringIn({"NCHW", "NHWC"}) + .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) .End(); AddOpCompat(OpCompat("affine_channel")) .AddInput("X") diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc index c362eec34b0683..3a012b908482ac 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc @@ -620,6 +620,7 @@ ConvTransposeBNFusePass::ConvTransposeBNFusePass() { .IsType>() .End() .AddAttr("padding_algorithm") + .IsOptional() .IsStringIn({"EXPLICIT", "SAME", "VALID"}) .End() .AddAttr("data_format") @@ -663,6 +664,7 @@ ConvTransposeEltwiseAddBNFusePass::ConvTransposeEltwiseAddBNFusePass() { .IsType>() .End() .AddAttr("padding_algorithm") + .IsOptional() .IsStringIn({"EXPLICIT", "SAME", "VALID"}) .End() .AddAttr("data_format") diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc 
b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc index 573436d393b855..3d1c1eb55aa079 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc @@ -68,6 +68,7 @@ ConvElementwiseAdd2ActFusePass::ConvElementwiseAdd2ActFusePass() { .AddAttr("paddings") .End() .AddAttr("padding_algorithm") + .IsOptional() .IsStringIn({"EXPLICIT", "SAME", "VALID"}) .End() .AddAttr("groups") diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 50c5671cb91a49..21e743e3587d80 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -25,6 +25,7 @@ limitations under the License. */ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/variant.h" +#include "paddle/utils/any.h" DECLARE_bool(convert_all_blocks); @@ -147,8 +148,8 @@ class Graph { platform::errors::PreconditionNotMet( "%s attribute not registered for current graph.", attr_name)); try { - return *boost::any_cast(attrs_.at(attr_name)); - } catch (boost::bad_any_cast &) { + return *paddle::any_cast(attrs_.at(attr_name)); + } catch (paddle::bad_any_cast &) { PADDLE_THROW(platform::errors::InvalidArgument( "Invalid attribute type of %s, expected: %s, received: %s.", attr_name, platform::demangle(typeid(AttrType *).name()), // NOLINT @@ -426,7 +427,7 @@ class Graph { const Graph *main_graph_; // not owned. std::vector> sub_graphs_; - std::map attrs_; + std::map attrs_; std::map> attr_dels_; std::map> nodes_; std::unordered_set node_set_; diff --git a/paddle/fluid/framework/ir/lock_free_optimize_pass.h b/paddle/fluid/framework/ir/lock_free_optimize_pass.h index 26ec61fd36eb3c..93b6396bf7f310 100644 --- a/paddle/fluid/framework/ir/lock_free_optimize_pass.h +++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.h @@ -17,10 +17,9 @@ #include #include -#include - #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/string/string_helper.h" namespace paddle { namespace framework { @@ -109,7 +108,7 @@ class LockFreeOptimizePass : public Pass { "Input argument node cannot be nullptr.")); return node->NodeType() == Node::Type::kVariable && - boost::algorithm::ends_with(node->Name(), name); + paddle::string::ends_with(node->Name(), name); } inline bool IsVarNameContains(ir::Node* node, const std::string& name) const { diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc index a7514038d400b6..41539a05b37177 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc @@ -47,6 +47,7 @@ ConvBiasFusePass::ConvBiasFusePass() { .IsType>() .End() .AddAttr("padding_algorithm") + .IsOptional() .IsStringIn({"EXPLICIT", "SAME", "VALID"}) .End() .AddAttr("groups") @@ -56,7 +57,7 @@ ConvBiasFusePass::ConvBiasFusePass() { .IsType>() .End() .AddAttr("data_format") - .IsStringIn({"NCHW", "NHWC"}) + .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) .End(); AddOpCompat(OpCompat("elementwise_add")) @@ -110,6 +111,7 @@ Conv2DTransposeBiasFusePass::Conv2DTransposeBiasFusePass() { .IsType>() .End() .AddAttr("padding_algorithm") + .IsOptional() .IsStringIn({"EXPLICIT", "SAME", "VALID"}) .End() .AddAttr("data_format") @@ -135,6 +137,7 @@ Conv3DBiasFusePass::Conv3DBiasFusePass() { .IsType>() .End() .AddAttr("padding_algorithm") + 
.IsOptional() .IsStringIn({"EXPLICIT", "SAME", "VALID"}) .End() .AddAttr("groups") diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc index bd65ad8e643785..b07cc58959faa0 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc @@ -158,11 +158,6 @@ void ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::operator()( Node* elementwise_add_op; Node* elementwise_add_identity; Node* elementwise_add_out; - if (!pass_->IsCompat(subgraph, graph)) { - LOG(WARNING) - << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; - return; - } std::tie(conv_op, conv_input, conv_filter, conv_output) = get_node_from_conv_op(subgraph); @@ -175,6 +170,12 @@ void ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::operator()( if (HasFusedActivation(conv_op)) return; + if (!pass_->IsCompat(subgraph, graph)) { + LOG(WARNING) + << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; + return; + } + conv_op->Op()->SetInput("ResidualData", {elementwise_add_identity->Name()}); conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); conv_op->Op()->SetAttr("fuse_residual_connection", true); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc index 2483a506a8f934..2b9419a5502f1c 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc @@ -77,7 +77,7 @@ CPUQuantizeSquashPass::CPUQuantizeSquashPass() { .End() .AddAttr("data_format") .IsOptional() - .IsStringIn({"NCHW", "NHWC"}) + .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) .End(); } diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt b/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt index f945ddbd5d6a31..6764799d828661 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt @@ -18,4 +18,4 @@ cc_library(fuse_all_reduce_op_pass SRCS fuse_all_reduce_op_pass.cc DEPS graph gr cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS all_reduce_op_handle graph graph_helper pass) cc_library(backward_optimizer_op_deps_pass SRCS backward_optimizer_op_deps_pass.cc DEPS graph graph_helper pass) cc_library(add_reader_dependency_pass SRCS add_reader_dependency_pass.cc DEPS graph graph_helper pass) -cc_library(fix_op_run_order_pass SRCS fix_op_run_order_pass DEPS graph graph_helper multi_devices_helper pass op_handle_base eager_deletion_op_handle) +cc_library(fix_op_run_order_pass SRCS fix_op_run_order_pass.cc DEPS graph graph_helper multi_devices_helper pass op_handle_base eager_deletion_op_handle) diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index d0568f39ef6a45..54bd4376c6e5cb 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -23,7 +23,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/platform/macros.h" - +#include "paddle/utils/any.h" namespace paddle { namespace framework { class OpDesc; @@ -104,8 +104,8 @@ class Node { template T& Wrapper() { try { - return *boost::any_cast(wrapper_); - } catch (boost::bad_any_cast&) { + return *paddle::any_cast(wrapper_); + } catch (paddle::bad_any_cast&) { PADDLE_THROW(platform::errors::InvalidArgument( "Invalid wrapper type error, expected %s, actual %s.", typeid(T).name(), wrapper_type_.name())); @@ -277,7 +277,7 @@ class Node { Node() = delete; - boost::any wrapper_; + paddle::any wrapper_; std::function wrapper_deleter_; std::type_index wrapper_type_ = std::type_index(typeid(void)); diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index 8fb96bec9cbd56..fecdfc404e6dca 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -26,6 +26,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/platform/variant.h" +#include "paddle/utils/any.h" namespace paddle { namespace framework { @@ -73,8 +74,8 @@ class Pass { platform::errors::InvalidArgument( "Attribute %s not registered for pass.", attr_name)); try { - return *boost::any_cast(attrs_.at(attr_name)); - } catch (boost::bad_any_cast &) { + return *paddle::any_cast(attrs_.at(attr_name)); + } catch (paddle::bad_any_cast &) { auto TypeToString = [](const std::type_info &info) -> std::string { if (std::type_index(info) == std::type_index(typeid(bool *))) { return "bool"; @@ -166,7 +167,7 @@ class Pass { // Pass doesn't take ownership. PassRegistrar should delete default_attrs void RegisterDefaultPassAttrs( - std::map default_attr_values) { + std::map default_attr_values) { for (auto const &attr_name : default_attr_values) { default_pass_attrs_.insert(attr_name.first); } @@ -180,7 +181,7 @@ class Pass { std::unordered_set required_pass_attrs_; std::unordered_set default_pass_attrs_; std::unordered_set required_graph_attrs_; - std::map attrs_; + std::map attrs_; std::map> attr_dels_; }; @@ -290,7 +291,7 @@ struct PassRegistrar : public Registrar { private: std::unordered_set required_pass_attrs_; std::unordered_set required_graph_attrs_; - std::map default_attr_values_; + std::map default_attr_values_; std::map> default_attr_dels_; }; diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index 068a50a1dc0e9a..b48c8c6e70a939 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -243,6 +243,7 @@ QuantDequantFusePass::QuantDequantFusePass() { .IsType>() .End() .AddAttr("padding_algorithm") + .IsOptional() .IsStringIn({"EXPLICIT", "SAME", "VALID"}) .End() .AddAttr("data_format") diff --git a/paddle/fluid/framework/new_exec.h b/paddle/fluid/framework/new_exec.h new file mode 100644 index 00000000000000..defa7a967336b5 --- /dev/null +++ b/paddle/fluid/framework/new_exec.h @@ -0,0 +1,629 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/executor_gc_helper.h" +#include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/new_exec_util.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/init.h" + +// USE_OP(fill_constant); +// USE_OP(elementwise_add); + +// using namespace std; + +namespace paddle { +namespace framework { + +using std::cerr; +using std::endl; + +using OpKernelComputeFunc = std::function; +using OpKernelMap = + std::unordered_map; + +framework::ProgramDesc load_from_file(const std::string& file_name) { + std::ifstream fin(file_name, std::ios::in | std::ios::binary); + fin.seekg(0, std::ios::end); + std::string buffer(fin.tellg(), ' '); + fin.seekg(0, std::ios::beg); + fin.read(&buffer[0], buffer.size()); + fin.close(); + + ProgramDesc program_desc(buffer); + return program_desc; +} + +struct OpKernelFunc { + OpKernelComputeFunc compute_func_; + OperatorBase* operator_base_; +}; + +struct VariableMetaInfo { + int var_ref_count_; +}; + +struct VariableScope { + std::vector var_list; + std::map name2id; + std::vector vec_meta_info_; +}; + +struct NextInstruction { + std::vector direct_run_; +}; + +struct EventInter {}; + +struct InstructionInfo { + std::vector dependecy_count_; +}; + +struct EventRun { + EventInter event_inter; + std::vector same_device_run_; + std::vector synchronized_run; +}; + +struct Instruction { + OpKernelFunc kernel_func_; + std::map> input_index_; + std::map> output_index_; + + std::vector gc_check_var_list; + NextInstruction next_instruction_; + std::vector vec_event_list_; +}; + +struct OpFuncNode { + // int unsed; + std::map> input_index; + std::map> output_index; + + OpKernelComputeFunc kernel_func_; +}; + +int convert(const platform::Place& place) { + if (is_cpu_place(place)) { + return 0; + } + if (is_gpu_place(place)) { + return 1; + } + + return -1; +} + +std::vector merge_vec(const std::vector& first, + const std::vector& second) { + std::vector out(first.size() + second.size()); + std::merge(first.begin(), first.end(), second.begin(), second.end(), + out.begin()); + + std::vector::iterator it; + it = std::unique(out.begin(), out.end()); + + out.resize(std::distance(out.begin(), it)); + + return out; +} + +void build_variable_outer_scope(const framework::ProgramDesc& pdesc, + VariableScope* var_scope, Scope* outer_scope) { + auto& global_block = pdesc.Block(0); + + for (auto& var : global_block.AllVars()) { + if (var->Name() == framework::kEmptyVarName) { + continue; + } + auto v = outer_scope->Var(var->Name()); + + if (var_scope->name2id.find(var->Name()) == var_scope->name2id.end()) { + 
var_scope->name2id[var->Name()] = var_scope->var_list.size(); + } + + InitializeVariable(v, var->GetType()); + var_scope->var_list.push_back(v); + } +} + +void build_variable_scope(const framework::ProgramDesc& pdesc, + VariableScope* var_scope) { + auto& global_block = pdesc.Block(0); + + for (auto& var : global_block.AllVars()) { + if (var->Name() == framework::kEmptyVarName) { + continue; + } + + if (var_scope->name2id.find(var->Name()) == var_scope->name2id.end()) { + var_scope->name2id[var->Name()] = var_scope->var_list.size(); + } + + auto v = new Variable(); + InitializeVariable(v, var->GetType()); + var_scope->var_list.push_back(v); + } +} + +void build_op_func_list(const framework::ProgramDesc& pdesc, + std::vector* op_list, + std::vector* vec_func_list, + VariableScope* var_scope, + const platform::Place& place) { + auto& global_block = pdesc.Block(0); + + for (auto& op : global_block.AllOps()) { + VLOG(3) << op->Type(); + // << op->Type() << endl; + + auto& info = OpInfoMap::Instance().Get(op->Type()); + + const VariableNameMap& inputs_names = op->Inputs(); + const VariableNameMap& outputs_names = op->Outputs(); + AttributeMap op_attr_map = op->GetAttrMap(); + + if (info.Checker() != nullptr) { + info.Checker()->Check(&op_attr_map); + } + auto op_base = + info.Creator()(op->Type(), inputs_names, outputs_names, op_attr_map); + + OpFuncNode op_func_node; + + VariableValueMap ins_map; + std::map> ins_name2id; + for (auto& var_name_item : inputs_names) { + std::vector input_vars; + std::vector vec_ids; + input_vars.reserve(var_name_item.second.size()); + for (auto& var_name : var_name_item.second) { + auto it = var_scope->name2id.find(var_name); + assert(it != var_scope->name2id.end()); + input_vars.push_back(var_scope->var_list[it->second]); + vec_ids.push_back(it->second); + } + ins_map[var_name_item.first] = input_vars; + ins_name2id[var_name_item.first] = vec_ids; + } + + VariableValueMap outs_map; + std::map> outs_name2id; + for (auto& var_name_item : outputs_names) { + std::vector output_vars; + std::vector vec_ids; + output_vars.reserve(var_name_item.second.size()); + for (auto& var_name : var_name_item.second) { + auto it = var_scope->name2id.find(var_name); + assert(it != var_scope->name2id.end()); + output_vars.push_back(var_scope->var_list[it->second]); + vec_ids.push_back(it->second); + } + outs_map[var_name_item.first] = output_vars; + outs_name2id[var_name_item.first] = vec_ids; + } + + op_func_node.input_index = ins_name2id; + op_func_node.output_index = outs_name2id; + RuntimeContext runtime_context({}, {}); + runtime_context.inputs.swap(ins_map); + runtime_context.outputs.swap(outs_map); + RuntimeInferShapeContext infer_shape_ctx(*op_base, runtime_context); + static_cast(op_base)->InferShape( + &infer_shape_ctx); + auto& all_op_kernels = OperatorWithKernel::AllOpKernels(); + auto kernels_iter = all_op_kernels.find(op->Type()); + PADDLE_ENFORCE_NE( + kernels_iter, all_op_kernels.end(), + platform::errors::Unavailable( + "There are no kernels which are registered in the %s operator.", + op->Type())); + + OpKernelMap& kernels = kernels_iter->second; + // auto place = platform::CPUPlace(); + // auto place = platform::CUDAPlace(0); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place); + Scope scope; + auto exec_ctx = + ExecutionContext(*op_base, scope, *dev_ctx, runtime_context); + auto expected_kernel_key = + dynamic_cast(op_base) + ->GetExpectedKernelType(exec_ctx); + + VariableValueMap& ins_map_temp = 
runtime_context.inputs; + + for (auto& var_name_item : ins_map_temp) { + for (size_t i = 0; i < var_name_item.second.size(); ++i) { + auto var = var_name_item.second[i]; + auto tensor_in = static_cast(&(var->Get())); + if (!tensor_in->IsInitialized()) { + continue; + } + auto kernel_type_for_var = + static_cast(op_base) + ->GetKernelTypeForVar(var_name_item.first, *tensor_in, + expected_kernel_key); + if (!platform::is_same_place(kernel_type_for_var.place_, + expected_kernel_key.place_)) { + // need trans place + // 1. add var in scope + // 2. add copy op + std::string new_var_name = + "temp_1" + std::to_string(var_scope->var_list.size() + 1); + auto v = new Variable(); + v->GetMutable(); + var_scope->name2id[new_var_name] = var_scope->var_list.size(); + var_scope->var_list.push_back(v); + + VariableNameMap copy_in_map; + auto x_iter = inputs_names.find(var_name_item.first); + copy_in_map["X"] = {x_iter->second[i]}; + VariableNameMap copy_out_map; + copy_out_map["Out"] = {new_var_name}; + AttributeMap attr_map; + attr_map["dst_place_type"] = convert(place); + + std::map> copy_ins_name2id; + copy_ins_name2id["X"] = ins_name2id[var_name_item.first]; + std::map> copy_out_name2id; + copy_out_name2id["Out"] = {var_scope->name2id[new_var_name]}; + + op_func_node.input_index[var_name_item.first][i] = + var_scope->name2id[new_var_name]; + + VariableValueMap copy_ins_value_map; + copy_ins_value_map["X"] = {var}; + VariableValueMap copy_outs_value_map; + copy_outs_value_map["Out"] = {v}; + + auto& copy_info = OpInfoMap::Instance().Get("memcpy"); + auto copy_op = copy_info.Creator()("memcpy", copy_in_map, + copy_out_map, attr_map); + OpFuncNode copy_op_func_node; + copy_op_func_node.input_index = copy_ins_name2id; + copy_op_func_node.output_index = copy_out_name2id; + + RuntimeContext copy_runtime_context({}, {}); + copy_runtime_context.inputs.swap(copy_ins_value_map); + copy_runtime_context.outputs.swap(copy_outs_value_map); + RuntimeInferShapeContext copy_infer_shape_ctx(*copy_op, + copy_runtime_context); + static_cast(copy_op) + ->InferShape(©_infer_shape_ctx); + auto& all_op_kernels = OperatorWithKernel::AllOpKernels(); + auto kernels_iter = all_op_kernels.find("memcpy"); + PADDLE_ENFORCE_NE(kernels_iter, all_op_kernels.end(), + platform::errors::Unavailable( + "There are no kernels which are registered in " + "the memcpy operator.")); + + OpKernelMap& kernels = kernels_iter->second; + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place); + Scope scope; + auto copy_exec_ctx = + ExecutionContext(*copy_op, scope, *dev_ctx, copy_runtime_context); + auto expected_kernel_key = + dynamic_cast(copy_op) + ->GetExpectedKernelType(copy_exec_ctx); + auto kernel_iter = kernels.find(expected_kernel_key); + copy_op_func_node.kernel_func_ = + OpKernelComputeFunc(kernel_iter->second); + copy_op_func_node.kernel_func_(copy_exec_ctx); + op_list->push_back(copy_op); + vec_func_list->push_back(copy_op_func_node); + + var_name_item.second[i] = v; + } + } + } + + op_list->push_back(op_base); + + auto kernel_iter = kernels.find(expected_kernel_key); + PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), + platform::errors::NotFound( + "Operator (%s) does not have kernel for %s.", + op->Type(), KernelTypeToString(expected_kernel_key))); + + op_func_node.kernel_func_ = OpKernelComputeFunc(kernel_iter->second); + op_func_node.kernel_func_(exec_ctx); + vec_func_list->push_back(op_func_node); + } +} + +class InterpreterCore { + public: + InterpreterCore(const 
platform::Place& place, const ProgramDesc& prog, + const ProgramDesc& startup_prog, Scope* scope) + : place_(place), prog_(prog), outer_scope_(scope) { + paddle::framework::InitDevices(); + + is_build_ = false; + + if (outer_scope_ != nullptr) { + auto name_list = outer_scope_->LocalVarNames(); + for (auto name : name_list) { + auto v = outer_scope_->Var(name); + if (global_scope.name2id.find(name) == global_scope.name2id.end()) { + global_scope.name2id[name] = global_scope.var_list.size(); + } + + global_scope.var_list.push_back(v); + } + } + + paddle::framework::build_variable_outer_scope(startup_prog, &global_scope, + outer_scope_); + + std::vector vec_func_list; + std::vector op_list; + paddle::framework::build_op_func_list( + startup_prog, &op_list, &vec_func_list, &global_scope, place_); + // add variable to outer_scope + } + void run(const std::vector& vec_name, + const std::vector& vec_tensor, + const std::vector& vec_fetch_name, + std::vector* vec_out) { + if (is_build_ == false) { + paddle::framework::build_variable_scope(prog_, &global_scope); + } + for (size_t i = 0; i < vec_name.size(); ++i) { + auto it = global_scope.name2id.find(vec_name[i]); + assert(it != global_scope.name2id.end()); + + auto feed_tensor = + global_scope.var_list[it->second]->GetMutable(); + feed_tensor->ShareDataWith(vec_tensor[i]); + } + + if (is_build_ == false) { + paddle::framework::build_op_func_list(prog_, &op_list, &vec_func_list, + &global_scope, place_); + is_build_ = true; + // convert vec func_list to graph + convert(); + } else { + exec_instruction_list(vec_instruction_, global_scope, place_); + } + + for (size_t i = 0; i < vec_fetch_name.size(); ++i) { + auto it = global_scope.name2id.find(vec_fetch_name[i]); + assert(it != global_scope.name2id.end()); + PADDLE_ENFORCE_NE(it, global_scope.name2id.end(), + platform::errors::NotFound( + "Can't find (%d) the fetch var (%s) in scope", i, + vec_fetch_name[i])); + + auto fetch_tensor = + global_scope.var_list[it->second]->GetMutable(); + + if (platform::is_gpu_place(fetch_tensor->place())) { + Tensor out; + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place_); + dev_ctx->Wait(); + TensorCopySync(*fetch_tensor, platform::CPUPlace(), &out); + dev_ctx->Wait(); + vec_out->push_back(out); + } else { + Tensor out; + TensorCopySync(*fetch_tensor, platform::CPUPlace(), &out); + vec_out->push_back(out); + } + } + } + + private: + void convert() { + input_var2op_info_.resize(global_scope.var_list.size()); + + vec_instruction_.reserve(vec_func_list.size()); + dependecy_count_.resize(vec_func_list.size()); + global_scope.vec_meta_info_.resize(global_scope.var_list.size()); + for (size_t i = 0; i < vec_func_list.size(); ++i) { + Instruction temp_inst; + temp_inst.kernel_func_.compute_func_ = vec_func_list[i].kernel_func_; + temp_inst.kernel_func_.operator_base_ = op_list[i]; + temp_inst.input_index_ = vec_func_list[i].input_index; + temp_inst.output_index_ = vec_func_list[i].output_index; + + std::vector gc_check_input_list; + for (auto& item : vec_func_list[i].input_index) { + for (auto id : item.second) { + input_var2op_info_[id].push_back(i); + gc_check_input_list.push_back(id); + } + } + std::sort(gc_check_input_list.begin(), gc_check_input_list.end()); + auto last = + std::unique(gc_check_input_list.begin(), gc_check_input_list.end()); + gc_check_input_list.erase(last, gc_check_input_list.end()); + for (auto var_id : gc_check_input_list) { + global_scope.vec_meta_info_[var_id].var_ref_count_++; + 
      }

+      temp_inst.gc_check_var_list.swap(gc_check_input_list);
+
+      vec_instruction_.push_back(temp_inst);
+    }
+
+    for (size_t i = 0; i < vec_instruction_.size(); ++i) {
+      std::vector<size_t> vec_temp;
+      for (auto& item : vec_instruction_[i].output_index_) {
+        for (auto id : item.second) {
+          vec_temp = merge_vec(vec_temp, input_var2op_info_[id]);
+        }
+      }
+
+      // In a Program, op order is very important information.
+      // An op can only add ops after itself as its next ops.
+      std::vector<size_t> filter_next;
+      filter_next.reserve(vec_temp.size());
+      for (auto item : vec_temp) {
+        if (item > i) {
+          filter_next.push_back(item);
+        }
+      }
+      vec_instruction_[i].next_instruction_.direct_run_ = filter_next;
+
+      // check output
+      for (auto& item : vec_instruction_[i].output_index_) {
+        for (auto id : item.second) {
+          if (input_var2op_info_[id].size() == 0) {
+            // output var is not used by any kernel
+            vec_instruction_[i].gc_check_var_list.push_back(id);
+            global_scope.vec_meta_info_[id].var_ref_count_++;
+          }
+        }
+      }
+
+      for (auto inst_id : filter_next) {
+        dependecy_count_[inst_id]++;
+      }
+    }
+  }
+
+  void run_instr(const Instruction& instr_node, const VariableScope& var_scope,
+                 const platform::Place& place) {
+    auto op_base = instr_node.kernel_func_.operator_base_;
+    // build runtime context
+    VariableValueMap ins_map;
+    for (auto& var_name_item : instr_node.input_index_) {
+      std::vector<Variable*> input_vars;
+
+      input_vars.reserve(var_name_item.second.size());
+      for (auto& id : var_name_item.second) {
+        input_vars.emplace_back(var_scope.var_list[id]);
+      }
+      ins_map.emplace(var_name_item.first, std::move(input_vars));
+    }
+
+    VariableValueMap outs_map;
+    for (auto& var_name_item : instr_node.output_index_) {
+      std::vector<Variable*> out_vars;
+
+      out_vars.reserve(var_name_item.second.size());
+      for (auto& id : var_name_item.second) {
+        out_vars.emplace_back(var_scope.var_list[id]);
+      }
+      outs_map.emplace(var_name_item.first, std::move(out_vars));
+    }
+
+    RuntimeContext runtime_context({}, {});
+    runtime_context.inputs.swap(ins_map);
+    runtime_context.outputs.swap(outs_map);
+
+    RuntimeInferShapeContext infer_shape_ctx(*op_base, runtime_context);
+
+    static_cast<const framework::OperatorWithKernel*>(op_base)->InferShape(
+        &infer_shape_ctx);
+
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto* dev_ctx = pool.Get(place);
+    Scope scope;
+
+    auto exec_context =
+        ExecutionContext(*op_base, scope, *dev_ctx, runtime_context);
+
+    instr_node.kernel_func_.compute_func_(exec_context);
+  }
+
+  void exec_instruction_list(const std::vector<Instruction>& vec_instr,
+                             const VariableScope& var_scope,
+                             const platform::Place& place) {
+    std::queue<size_t> working_queue;
+    auto working_dependecy_count = dependecy_count_;
+    for (size_t i = 0; i < dependecy_count_.size(); ++i) {
+      if (dependecy_count_[i] == 0) {
+        working_queue.push(i);
+      }
+    }
+
+    auto working_var_ref = global_scope.vec_meta_info_;
+
+    size_t run_op_number = 0;
+    while (!working_queue.empty()) {
+      auto instr_id = working_queue.front();
+      working_queue.pop();
+      auto& instr_node = vec_instr[instr_id];
+      run_instr(instr_node, var_scope, place);
+
+      auto& next_instr = instr_node.next_instruction_.direct_run_;
+      ++run_op_number;
+
+      for (auto next_i : next_instr) {
+        --working_dependecy_count[next_i];
+        if (working_dependecy_count[next_i] == 0) {
+          working_queue.push(next_i);
+        }
+      }
+
+      // GC information
+
+      auto& gc_check_list = instr_node.gc_check_var_list;
+      for (auto var_id : gc_check_list) {
+        --working_var_ref[var_id].var_ref_count_;
+      }
+    }
+
+    for (size_t i = 0; i < working_var_ref.size(); ++i) {
+      if
(working_var_ref[i].var_ref_count_ != 0) { + cerr << " var ref is not zero " << i << endl; + } + } + } + + const platform::Place& place_; + const ProgramDesc& prog_; + paddle::framework::VariableScope global_scope; + std::vector vec_func_list; + std::vector op_list; + + bool is_build_; + + std::vector vec_instruction_; + + InstructionInfo instruction_info_; + + std::vector dependecy_count_; + std::vector ref_coun_info; + std::vector> input_var2op_info_; + + Scope* outer_scope_; +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_exec_test.cc b/paddle/fluid/framework/new_exec_test.cc new file mode 100644 index 00000000000000..7bfb6b6540cff8 --- /dev/null +++ b/paddle/fluid/framework/new_exec_test.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/executor_gc_helper.h" +#include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/device_context.h" + +#include "paddle/fluid/pybind/pybind.h" + +#include "gperftools/profiler.h" +#include "paddle/fluid/framework/new_exec.h" +#include "paddle/fluid/platform/init.h" + +int main() { + paddle::framework::InitDevices(); + paddle::framework::VariableScope global_scope; + auto place = paddle::platform::CUDAPlace(0); + auto test_prog = paddle::framework::load_from_file("lm_startup_program"); + { + paddle::framework::build_variable_scope(test_prog, &global_scope); + + std::vector vec_func_list; + std::vector op_list; + paddle::framework::build_op_func_list(test_prog, op_list, vec_func_list, + &global_scope, place); + + // paddle::framework::exec_op_func_list( vec_func_list, op_list, + // global_scope, place ); + } + + cerr << "run main" << endl; + auto main_prog = paddle::framework::load_from_file("lm_main_program"); + + paddle::framework::build_variable_scope(main_prog, &global_scope); + + std::vector vec_main_func_list; + std::vector op_main_list; + paddle::framework::build_op_func_list( + main_prog, op_main_list, vec_main_func_list, &global_scope, place); + paddle::framework::Scope scope; + paddle::framework::InterpreterCore interp_core(place, main_prog, test_prog, + &scope); + auto start = std::chrono::steady_clock::now(); + ProfilerStart("new_executor.prof"); + for (size_t i = 0; i < 2320; ++i) { + if (i % 200 == 0) { + cerr << i << endl; + } + // paddle::framework::exec_op_func_list( vec_main_func_list, op_main_list, + // global_scope, place ); + std::vector vec_out; + interp_core.run({}, {}, {}, vec_out); + } + ProfilerStop(); + auto end = 
std::chrono::steady_clock::now(); + std::chrono::duration diff = end - start; + + cerr << "time cost " << diff.count() << endl; + + return 1; +} diff --git a/paddle/fluid/framework/new_exec_util.h b/paddle/fluid/framework/new_exec_util.h new file mode 100644 index 00000000000000..1783b9be74becf --- /dev/null +++ b/paddle/fluid/framework/new_exec_util.h @@ -0,0 +1,472 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/************************************************************************* + > File Name: new_exec_util.h + > Author: guanshanshan@baidu.com + > Created Time: Fri 23 Jul 2021 06:19:19 AM UTC + ************************************************************************/ + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include + +#include "paddle/fluid/framework/executor_gc_helper.h" +#include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/init.h" + +namespace paddle { +namespace framework { + +class RuntimeInferShapeContext : public InferShapeContext { + public: + RuntimeInferShapeContext(const OperatorBase& op, const RuntimeContext& ctx) + : op_(op), ctx_(ctx) {} + + bool HasInput(const std::string& name) const override { + // has only one input + const auto& ins = ctx_.inputs; + auto it = ins.find(name); + if (it == ins.end()) { + return false; + } + const auto& in = it->second; + if (in.size() == 0) return false; + PADDLE_ENFORCE_EQ( + in.size(), 1UL, + platform::errors::InvalidArgument( + "Input %s should not contain more than one inputs.", name)); + return in[0] != nullptr; + } + + bool HasOutput(const std::string& name) const override { + // has only one output + const auto& outs = ctx_.outputs; + auto it = outs.find(name); + if (it == outs.end()) { + return false; + } + const auto& out = it->second; + if (out.size() == 0) { + return false; + } + PADDLE_ENFORCE_EQ( + out.size(), 1UL, + platform::errors::InvalidArgument( + "Output %s should not contain more than one outputs.", name)); + return out[0] != nullptr; + } + + bool HasInputs(const std::string& name) const override { + const auto& ins = ctx_.inputs; + auto it = ins.find(name); + if (it == ins.end() || it->second.empty()) { + return false; + } + for (auto& input : it->second) { + if (input == nullptr) { + return false; + } + } + return true; + } + + bool HasOutputs(const std::string& name) const override { + const auto& outs = ctx_.outputs; + auto it = outs.find(name); + if (it == outs.end() || it->second.empty()) { + return false; + } + for (auto& output : 
it->second) { + if (output == nullptr) { + return false; + } + } + return true; + } + + AttrReader Attrs() const override { return AttrReader(op_.Attrs()); } + + std::vector Inputs(const std::string& name) const override { + return op_.Inputs(name); + } + + std::vector Outputs(const std::string& name) const override { + return op_.Outputs(name); + } + + std::string GetInputNameByIdx(size_t idx) const override { + auto& op_proto = + paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_; + PADDLE_ENFORCE_LT(idx, op_proto->inputs().size(), + platform::errors::OutOfRange( + "The index should be less than the size of inputs of " + "operator %s, but got index is %d and size is %d", + op_.Type(), idx, op_proto->inputs().size())); + return op_proto->inputs()[idx].name(); + } + + std::string GetOutputNameByIdx(size_t idx) const override { + auto& op_proto = + paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_; + PADDLE_ENFORCE_LT( + idx, op_proto->outputs().size(), + platform::errors::OutOfRange( + "The index should be less than the size of outputs of " + "operator %s, but got index is %d and size is %d", + op_.Type(), idx, op_proto->outputs().size())); + return op_proto->outputs()[idx].name(); + } + + void ShareDim(const std::string& in, const std::string& out, size_t i = 0, + size_t j = 0) override { + auto in_it = ctx_.inputs.find(in); + auto out_it = ctx_.outputs.find(out); + PADDLE_ENFORCE_NE( + in_it, ctx_.inputs.end(), + platform::errors::NotFound("Input %s does not exist.", in)); + PADDLE_ENFORCE_NE( + out_it, ctx_.outputs.end(), + platform::errors::NotFound("Output %s does not exist.", out)); + PADDLE_ENFORCE_LT(i, in_it->second.size(), + platform::errors::InvalidArgument( + "The index of input dimension is out of range, " + "excepted index less than %zu, but received %zu.", + in_it->second.size(), i)); + PADDLE_ENFORCE_LT(j, out_it->second.size(), + platform::errors::InvalidArgument( + "The index of output dimension is out of range, " + "excepted index less than %zu, but received %zu.", + out_it->second.size(), j)); + + Variable* in_var = in_it->second[i]; + Variable* out_var = out_it->second[j]; + + PADDLE_ENFORCE_EQ( + in_var->Type(), out_var->Type(), + platform::errors::InvalidArgument( + "The type of input (%s) and output (%s) are inconsistent.", in, + out)); + + if (in_var->IsType()) { + auto& in_sele_rows = in_var->Get(); + auto out_sele_rows = out_var->GetMutable(); + out_sele_rows->mutable_value()->Resize(in_sele_rows.value().dims()); + out_sele_rows->set_rows(in_sele_rows.rows()); + out_sele_rows->set_height(in_sele_rows.height()); + } else if (in_var->IsType()) { + auto& in_lod_tensor = in_var->Get(); + auto* out_lod_tensor = out_var->GetMutable(); + out_lod_tensor->Resize(in_lod_tensor.dims()); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Currently, the input type of ShareDim only can be LoDTensor " + "or SelectedRows.")); + } + } + + void ShareAllLoD(const std::string& in, + const std::string& out) const override { + auto in_it = ctx_.inputs.find(in); + auto out_it = ctx_.outputs.find(out); + PADDLE_ENFORCE_NE(in_it, ctx_.inputs.end(), + platform::errors::NotFound( + "Input [%s] found error in Op [%s]", in, op_.Type())); + PADDLE_ENFORCE_NE( + out_it, ctx_.outputs.end(), + platform::errors::NotFound("Output [%s] found error in Op [%s]", out, + op_.Type())); + + auto& in_var_list = in_it->second; + auto& out_var_list = out_it->second; + + PADDLE_ENFORCE_EQ( + in_var_list.size(), out_var_list.size(), + platform::errors::PreconditionNotMet( 
+ "Op [%s]: Input var size should be equal with output var size", + op_.Type())); + + auto& out_var_names = op_.Outputs(out); + + for (size_t i = 0; i < in_var_list.size(); ++i) { + if (out_var_names[i] == framework::kEmptyVarName) { + continue; + } + + Variable* in_var = in_var_list[i]; + if (!in_var->IsType()) return; + Variable* out_var = out_var_list[i]; + PADDLE_ENFORCE_EQ(out_var->IsType(), true, + platform::errors::PreconditionNotMet( + "The %d-th output of Output(%s) must be LoDTensor.", + i, out_var_names[i])); + auto& in_tensor = in_var->Get(); + auto* out_tensor = out_var->GetMutable(); + out_tensor->set_lod(in_tensor.lod()); +#ifdef PADDLE_WITH_MKLDNN + if (in_tensor.layout() != DataLayout::kMKLDNN) +#endif + out_tensor->set_layout(in_tensor.layout()); + } + } + + void ShareLoD(const std::string& in, const std::string& out, size_t i = 0, + size_t j = 0) const override { + auto in_it = ctx_.inputs.find(in); + auto out_it = ctx_.outputs.find(out); + PADDLE_ENFORCE_NE( + in_it, ctx_.inputs.end(), + platform::errors::NotFound("Input %s does not exist.", in)); + PADDLE_ENFORCE_NE( + out_it, ctx_.outputs.end(), + platform::errors::NotFound("Output %s does not exist.", out)); + PADDLE_ENFORCE_LT(i, in_it->second.size(), + platform::errors::InvalidArgument( + "The index of input dimension is out of range, " + "excepted index less than %zu, but received %zu.", + in_it->second.size(), i)); + PADDLE_ENFORCE_LT(j, out_it->second.size(), + platform::errors::InvalidArgument( + "The index of output dimension is out of range, " + "excepted index less than %zu, but received %zu.", + out_it->second.size(), j)); + + Variable* in_var = in_it->second.at(i); + if (!in_var->IsType()) return; + Variable* out_var = out_it->second.at(j); + PADDLE_ENFORCE_EQ( + out_var->IsType(), true, + platform::errors::InvalidArgument( + "The %zu-th output of Output(%s) must be LoDTensor.", j, out)); + auto& in_tensor = in_var->Get(); + auto* out_tensor = out_var->GetMutable(); + out_tensor->set_lod(in_tensor.lod()); + +// TODO(dzhwinter) : reuse ShareLoD in most operators. +// Need to call ShareLayout explicitly in sequence related ops. +// Shall we have a better method to shared info between in/out Tensor? +#ifdef PADDLE_WITH_MKLDNN + // Fix me: ugly workaround below + // Correct solution: + // set_layout() should NOT be called here (i.e. ShareLoD). Instead, + // layout of output tensor should be set "manually" in Compute() + // of each OPKernel. The reason layout should NOT be shared between + // input and output "automatically" (now by InferShape()->ShareLoD()) + // is that layout transform may occur after InferShape(). + // Workaround: + // Skip set_layout() when input layout is kMKLDNN + // This is to avoid kMKLDNN is populated wrongly into a non-MKLDNN + // OPKernel. In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called + // in Compute() + if (in_tensor.layout() != DataLayout::kMKLDNN) +#endif + out_tensor->set_layout(in_tensor.layout()); + } + + int32_t GetLoDLevel(const std::string& in, size_t i = 0) const override { + PADDLE_THROW(platform::errors::PreconditionNotMet( + "GetLoDLevel is only used in compile time. The calculation of " + "output's actual lod is different among operators so that should be " + "set in the runtime kernel.")); + } + + void SetLoDLevel(const std::string& out, int32_t lod_level, + size_t j = 0) const override { + PADDLE_THROW(platform::errors::PreconditionNotMet( + "SetLoDLevel is only used in compile time. 
The calculation of " + "output's actual lod is different among operators so that should be " + "set in the runtime kernel.")); + } + + bool IsRuntime() const override { return true; } + + // TODO(paddle-dev): Can this be template? + std::vector GetInputVarPtrs( + const std::string& name) override { + const std::vector& vars = InputVars(name); + std::vector res; + res.reserve(vars.size()); + res.insert(res.begin(), vars.begin(), vars.end()); + return res; + } + + std::vector GetOutputVarPtrs( + const std::string& name) override { + const std::vector& vars = OutputVars(name); + std::vector res; + res.reserve(vars.size()); + res.insert(res.begin(), vars.begin(), vars.end()); + return res; + } + + DDim GetInputDim(const std::string& name) const override { + const std::vector& vars = InputVars(name); + PADDLE_ENFORCE_EQ( + vars.size(), 1UL, + platform::errors::InvalidArgument( + "Input(%s) should hold one element, but now it holds %zu elements.", + name, vars.size())); + return this->GetDim(vars[0]); + } + + std::vector GetInputsDim(const std::string& name) const override { + const std::vector& vars = InputVars(name); + return GetDims(vars); + } + + std::vector GetInputsVarType( + const std::string& name) const override { + return GetVarTypes(InputVars(name)); + } + + std::vector GetOutputsVarType( + const std::string& name) const override { + return GetVarTypes(OutputVars(name)); + } + + void SetOutputDim(const std::string& name, const DDim& dim) override { + auto& vars = OutputVars(name); + PADDLE_ENFORCE_EQ( + vars.size(), 1UL, + platform::errors::InvalidArgument("Output(%s) should hold one element, " + "but now it holds %zu elements.", + name, vars.size())); + SetDim(vars[0], dim); + } + + void SetOutputsDim(const std::string& name, + const std::vector& dims) override { + auto& vars = OutputVars(name); + SetDims(vars, dims); + } + + protected: + DDim GetDim(Variable* var) const { + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::InvalidArgument("Input variable is nullptr.")); + if (var->IsType()) { + return var->Get().dims(); + } else if (var->IsType()) { + return var->Get().GetCompleteDims(); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Only LoDTensor or SelectedRows support 'GetDim', but input " + "Variable's type is %s.", + ToTypeName(var->Type()))); + } + } + + std::vector GetDims(const std::vector& vars) const { + std::vector ret; + ret.reserve(vars.size()); + std::transform(vars.begin(), vars.end(), std::back_inserter(ret), + [this](Variable* var) { return this->GetDim(var); }); + return ret; + } + + std::vector GetRepeatedDims(const std::string& name) const override { + PADDLE_THROW(platform::errors::PreconditionNotMet( + "GetRepeatedDims method only ban be used in compile time.")); + } + + void SetDim(Variable* var, const DDim& dim) { + if (var->IsType()) { + var->GetMutable()->Resize(dim); + } else if (var->IsType()) { + var->GetMutable()->set_height(dim[0]); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Variable type error, expect LoDTensor or SelectedRows, but received " + "(%s).", + ToTypeName(var->Type()))); + } + } + + void SetDims(const std::vector& vars, + const std::vector& dims) { + size_t length = vars.size(); + PADDLE_ENFORCE_EQ(length, dims.size(), + platform::errors::InvalidArgument( + "The number of input variables do not match the " + "number of input dimensions, the number of variables " + "is %zu, the number of dimensions is %zu.", + length, dims.size())); + for (size_t i = 0; i < length; ++i) { + if (vars[i] == nullptr) { + 
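+ // an unbound (nullptr) output slot has no dims to set; skip it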
continue; + } + SetDim(vars[i], dims[i]); + } + } + + void SetRepeatedDims(const std::string& name, + const std::vector& dims) override { + PADDLE_THROW(platform::errors::PreconditionNotMet( + "SetRepeatedDims method only can be used in compile time.")); + } + + std::vector GetVarTypes( + const std::vector& vars) const { + std::vector retv; + retv.resize(vars.size()); + std::transform(vars.begin(), vars.end(), retv.begin(), + std::bind(std::mem_fn(&RuntimeInferShapeContext::GetVarType), + this, std::placeholders::_1)); + return retv; + } + + proto::VarType::Type GetVarType(Variable* var) const { + return ToVarType(var->Type()); + } + + private: + const std::vector& InputVars(const std::string& name) const { + auto it = ctx_.inputs.find(name); + PADDLE_ENFORCE_NE( + it, ctx_.inputs.end(), + platform::errors::NotFound( + "Operator (%s) does not have the input (%s).", op_.Type(), name)); + return it->second; + } + + const std::vector& OutputVars(const std::string& name) const { + auto it = ctx_.outputs.find(name); + PADDLE_ENFORCE_NE( + it, ctx_.outputs.end(), + platform::errors::NotFound( + "Operator (%s) does not have the outputs (%s).", op_.Type(), name)); + return it->second; + } + + const OperatorBase& op_; + const RuntimeContext& ctx_; +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 0f7012940d76b0..6a9f5577705335 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1254,9 +1254,10 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx, } #endif #ifdef PADDLE_WITH_XPU - if (kernel_iter == kernels.end() && - is_xpu_place(expected_kernel_key.place_) && - !paddle::platform::is_xpu_support_op(type_, expected_kernel_key)) { + if ((kernel_iter == kernels.end() && + is_xpu_place(expected_kernel_key.place_) && + !paddle::platform::is_xpu_support_op(type_, expected_kernel_key)) || + paddle::platform::is_in_xpu_black_list(type_)) { VLOG(3) << "missing XPU kernel: " << type_ << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 619d31c4f5b257..93f2fd38a73064 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -131,9 +131,10 @@ PreparedOp PrepareImpl(const NameVarMap& ins, auto& kernels = kernels_iter->second; auto kernel_iter = kernels.find(expected_kernel_key); #ifdef PADDLE_WITH_XPU - if (kernel_iter == kernels.end() && - is_xpu_place(expected_kernel_key.place_) && - !paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key)) { + if ((kernel_iter == kernels.end() && + is_xpu_place(expected_kernel_key.place_) && + !paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key)) || + paddle::platform::is_in_xpu_black_list(op.Type())) { VLOG(3) << "missing XPU kernel: " << op.Type() << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 3d97d68b5c7dfd..9dc9c4d90acaba 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -30,6 +30,8 @@ DECLARE_string(tracer_mkldnn_ops_off); namespace paddle { namespace imperative { +thread_local bool Tracer::has_grad_ = true; + static std::shared_ptr g_current_tracer(nullptr); const std::shared_ptr& GetCurrentTracer() { return g_current_tracer; } diff --git 
a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 8f50550878262f..b734ae5c499369 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -118,9 +118,9 @@ class Tracer { bool enable_program_desc_tracing_{false}; std::unique_ptr generator_; platform::Place expected_place_; - bool has_grad_{true}; bool enable_autocast_{false}; GarbageCollectorMap gcs_; + static thread_local bool has_grad_; }; // To access static variable current_tracer diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index c002c7a10cb7b3..6567c41ee1fedc 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -39,6 +39,7 @@ get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) # Adapt to custom op mechanism: Include the header files related to the data type # to avoid exposing the path of the underlying file include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform) +include_directories(${PADDLE_SOURCE_DIR}/paddle/utils) add_subdirectory(api) diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 5ed6691ebb8673..b117a21dea3e65 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -65,10 +65,13 @@ T *Tensor::mutable_data(PlaceType place) { case static_cast(PlaceType::kXPU): { return tensor->mutable_data(paddle::platform::XPUPlace(device_)); } + case static_cast(PlaceType::kNPU): { + return tensor->mutable_data(paddle::platform::NPUPlace(device_)); + } default: PADDLE_THROW(paddle::platform::errors::Unavailable( - "Only CPU / CUDA / XPU places is supported. The place `%d` is not " - "supported.", + "Only CPU / CUDA / XPU / NPU places is supported. 
The place `%d` is " + "not supported.", static_cast(place))); break; } @@ -86,6 +89,8 @@ T *Tensor::data(PlaceType *place, int *size) const { *place = PlaceType::kGPU; } else if (paddle::platform::is_xpu_place(tensor->place())) { *place = PlaceType::kXPU; + } else if (paddle::platform::is_npu_place(tensor->place())) { + *place = PlaceType::kNPU; } else { *place = PlaceType::kUNK; } diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc b/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc index 7e709924e91f93..0c092a8684d1ad 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc @@ -133,6 +133,14 @@ TEST(Tensor, FillRandomDataAndCheck) { ASSERT_TRUE(FillRandomDataAndCheck(PlaceType::kGPU)); ASSERT_TRUE(SetPlaceAndCheck(PlaceType::kGPU)); #endif +#ifdef PADDLE_WITH_ASCEND_CL + ASSERT_TRUE(FillRandomDataAndCheck(PlaceType::kNPU)); + ASSERT_TRUE(SetPlaceAndCheck(PlaceType::kNPU)); +#endif +#ifdef PADDLE_WITH_XPU + ASSERT_TRUE(FillRandomDataAndCheck(PlaceType::kXPU)); + ASSERT_TRUE(SetPlaceAndCheck(PlaceType::kXPU)); +#endif } } // namespace paddle_infer diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index d9e87122ac258c..dbaaf2bdc7c098 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -89,7 +89,6 @@ void TensorRTEngine::FreezeNetwork() { if (enable_int8) { infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16); infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kINT8); - infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kSTRICT_TYPES); if (calibrator_) { infer_builder_config_->setInt8Calibrator(calibrator_); diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 38c453bde6d2db..3604a47a7eb90b 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -32,6 +32,7 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #include "paddle/fluid/inference/utils/singleton.h" +#include "paddle/utils/any.h" namespace paddle { namespace framework { @@ -425,8 +426,8 @@ class TensorRTEngine { platform::errors::InvalidArgument( "Attribute %s not found in trt engine.", attr_name)); try { - return *boost::any_cast(attrs_.at(attr_name)); - } catch (boost::bad_any_cast&) { + return *paddle::any_cast(attrs_.at(attr_name)); + } catch (paddle::bad_any_cast&) { auto TypeToString = [](const std::type_info& info) -> std::string { if (std::type_index(info) == std::type_index(typeid(bool*))) { return "bool"; @@ -504,7 +505,7 @@ class TensorRTEngine { infer_ptr ihost_memory_; std::unordered_map quant_dynamic_range_; - std::unordered_map attrs_; + std::unordered_map attrs_; std::unordered_map> attr_dels_; // For dynamic shape diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 2829a740236d27..bfe3dfc85eecdd 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -703,8 +703,9 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; } // Paddle-TRT does not support the input tensors: Shape and ShapeTensor - if (desc.Input("Shape").size() >= 1 || - desc.Input("ShapeTensor").size() >= 1) { + auto reshape_inputs = desc.Inputs(); + if (reshape_inputs.find("Shape") != reshape_inputs.end() || + reshape_inputs.find("ShapeTensor") != reshape_inputs.end()) { return false; } std::vector shape = diff --git a/paddle/fluid/memory/allocation/npu_pinned_allocator.cc b/paddle/fluid/memory/allocation/npu_pinned_allocator.cc index 507a8589d94ddd..9178825efa9e1d 100644 --- a/paddle/fluid/memory/allocation/npu_pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/npu_pinned_allocator.cc @@ -39,6 +39,7 @@ void NPUPinnedAllocator::ProcessEventsAndFree() { } Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) { + std::lock_guard lock(mtx_); ProcessEventsAndFree(); void *ptr; int error = posix_memalign(&ptr, kAlignment, size); @@ -50,6 +51,7 @@ Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) { } void NPUPinnedAllocator::FreeImpl(Allocation *allocation) { + std::lock_guard lock(mtx_); void *ptr = allocation->ptr(); auto iter = npu_events_.find(allocation); aclrtEvent event = iter->second; @@ -65,11 +67,14 @@ void NPUPinnedAllocator::FreeImpl(Allocation *allocation) { } uint64_t NPUPinnedAllocator::ReleaseImpl(const platform::Place &place) { + std::lock_guard lock(mtx_); + // Empty implementation return static_cast(0); } void NPUPinnedAllocator::RecordEvent(Allocation *allocation, aclrtStream stream) { + std::lock_guard lock(mtx_); aclrtEvent event = nullptr; PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateEvent(&event)); PADDLE_ENFORCE_NPU_SUCCESS(aclrtRecordEvent(event, stream)); diff --git a/paddle/fluid/memory/allocation/npu_pinned_allocator.h b/paddle/fluid/memory/allocation/npu_pinned_allocator.h index 4c856b931ee2cf..b330b6e352ce42 100644 --- a/paddle/fluid/memory/allocation/npu_pinned_allocator.h +++ b/paddle/fluid/memory/allocation/npu_pinned_allocator.h @@ -42,6 +42,7 @@ class NPUPinnedAllocator : public Allocator { private: std::unordered_map npu_events_; + mutable std::mutex mtx_; }; } // namespace allocation diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc index 1ccd99c71f339a..02ce817bcc8b2b 100644 --- 
a/paddle/fluid/operators/activation_op_npu.cc +++ b/paddle/fluid/operators/activation_op_npu.cc @@ -144,6 +144,47 @@ class ReluGradNPUKernel : public framework::OpKernel { } }; +template +class Relu6NPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + out->mutable_data(ctx.GetPlace()); + + const auto& runner = NpuOpRunner("Relu6", + { + *x, + }, + {*out}, {}); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class Relu6GradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto stream = + ctx.template device_context() + .stream(); + + dx->mutable_data(ctx.GetPlace()); + const auto& runner = NpuOpRunner("Relu6Grad", {*dout, *out}, {*dx}, {}); + + runner.Run(stream); + } +}; + template class SqrtNPUKernel : public framework::OpKernel { public: @@ -431,6 +472,94 @@ class ReciprocalGradNPUKernel : public framework::OpKernel { } }; +template +class CosNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + const auto& runner = NpuOpRunner("Cos", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +template +class CosGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* x = ctx.Input("X"); + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto place = ctx.GetPlace(); + dx->mutable_data(place); + + Tensor sin_out(x->type()); // Temporary Tensor + sin_out.Resize(x->dims()); + sin_out.mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + const auto& runner = NpuOpRunner("Sin", {*x}, {sin_out}, {}); + runner.Run(stream); + + const auto& runner_dx = NpuOpRunner("Mul", {*dout, sin_out}, {*dx}, {}); + runner_dx.Run(stream); + + Tensor tmp(x->type()); // Temporary Tensor + tmp.Resize(framework::make_ddim({1, 1})); + tmp.mutable_data(place); + float factor = -1.; + FillNpuTensorWithConstant(&tmp, static_cast(factor)); + + const auto& runner_dx_ = NpuOpRunner("Xdivy", {*dx, tmp}, {*dx}, {}); + runner_dx_.Run(stream); + // dx = -dout * Sine(x); + } +}; + +template +class AtanNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + auto place = ctx.GetPlace(); + out->mutable_data(place); + const auto& runner = NpuOpRunner("Atan", {*x}, {*out}, {}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class AtanGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* x = ctx.Input("X"); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto place = ctx.GetPlace(); + dx->mutable_data(place); + auto stream = + ctx.template device_context() + .stream(); + const auto& 
runner_dx = NpuOpRunner("AtanGrad", {*x, *dout}, {*dx}, {}); + runner_dx.Run(stream); + } +}; + } // namespace operators } // namespace paddle @@ -457,6 +586,17 @@ REGISTER_OP_NPU_KERNEL( ops::ReluGradNPUKernel); +REGISTER_OP_NPU_KERNEL( + relu6, ops::Relu6NPUKernel, + ops::Relu6NPUKernel); + +REGISTER_OP_NPU_KERNEL( + relu6_grad, + ops::Relu6GradNPUKernel, + ops::Relu6GradNPUKernel); + REGISTER_OP_NPU_KERNEL( sqrt, ops::SqrtNPUKernel, ops::SqrtNPUKernel, ops::ReciprocalGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + cos, ops::CosNPUKernel, + ops::CosNPUKernel); + +REGISTER_OP_NPU_KERNEL( + cos_grad, ops::CosGradNPUKernel, + ops::CosGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + atan, ops::AtanNPUKernel, + ops::AtanNPUKernel); + +REGISTER_OP_NPU_KERNEL( + atan_grad, + ops::AtanGradNPUKernel, + ops::AtanGradNPUKernel); diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index b2cffc3f9063c1..be17bf9a03fc19 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -295,8 +295,7 @@ class BatchNormKernel bool global_stats = test_mode || use_global_stats; const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); + DataLayout data_layout = framework::StringToDataLayout(data_layout_str); const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); @@ -332,6 +331,12 @@ class BatchNormKernel saved_mean->mutable_data(ctx.GetPlace()); saved_variance->mutable_data(ctx.GetPlace()); + // input dimension is 2 and the format is NCHW. The input can be regarded + // as NHWC format + if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) { + data_layout = DataLayout::kNHWC; + } + if (!global_stats) { // saved_xx is use just in this batch of data EigenVectorArrayMap saved_mean_e( @@ -578,8 +583,7 @@ class BatchNormGradKernel bool use_global_stats = ctx.Attr("use_global_stats"); const bool is_test = ctx.Attr("is_test"); const float epsilon = ctx.Attr("epsilon"); - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); + DataLayout data_layout = framework::StringToDataLayout(data_layout_str); auto *d_x = ctx.Output(framework::GradVarName("X")); auto *d_scale = ctx.Output(framework::GradVarName("Scale")); @@ -633,6 +637,12 @@ class BatchNormGradKernel : x_dims[x_dims.size() - 1]); const int sample_size = x->numel() / N / C; + // input dimension is 2 and the format is NCHW. 
The input can be regarded as + // NHWC format + if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) { + data_layout = DataLayout::kNHWC; + } + // init output if (d_x) { d_x->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt index 3f210219608fb7..bd88c8f9cd2b40 100644 --- a/paddle/fluid/operators/collective/CMakeLists.txt +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -59,6 +59,8 @@ if(WITH_ASCEND_CL) DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) cc_test(recv_v2_op_npu_test SRCS recv_v2_op_npu_test.cc DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(checknumeric SRCS checknumeric_npu_test.cc + DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) cc_test(c_sync_comm_stream_op_npu_test SRCS c_sync_comm_stream_op_npu_test.cc DEPS op_registry c_broadcast_op c_comm_init_hccl_op c_sync_comm_stream_op c_gen_hccl_id_op gen_hccl_id_op_helper ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor) cc_test(c_sync_calc_stream_op_npu_test SRCS c_sync_calc_stream_op_npu_test.cc diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 3c51c65b073904..1076e84e613f4a 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -121,35 +121,44 @@ class CAllReduceOpCPUKernel : public framework::OpKernel { }; #if defined(PADDLE_WITH_ASCEND_CL) -// return true if found_inf_or_nan or return false; -template -bool CheckNumerics(const framework::ExecutionContext& exe_ctx, - aclrtStream stream, const paddle::framework::Tensor* in) { - auto& dev_ctx = - exe_ctx.template device_context(); +// return true if found_nan or return false; +inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx, + aclrtStream stream, + const paddle::framework::Tensor* in) { using Tensor = paddle::framework::Tensor; Tensor out(in->type()); - out.Resize(in->dims()); - out.mutable_data(dev_ctx.GetPlace()); - bool found_inf_data = false; + Tensor mean(in->type()); + mean.Resize({1}); + mean.mutable_data(dev_ctx.GetPlace()); + std::vector axes; + for (int i = 0; i < in->dims().size(); ++i) { + axes.push_back(i); + } + std::vector vec; try { - const auto& runner = - NpuOpRunner("CheckNumerics", {*in}, {out}, - {{"message", std::string("check_numberics")}}); - runner.Run(stream); - dev_ctx.Wait(); - } catch (platform::EnforceNotMet& exception) { - LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!"; - found_inf_data = true; + const auto& runner_mean = paddle::operators::NpuOpRunner( + "ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}}); + TensorToVector(mean, dev_ctx, &vec); } catch (...) 
{ - LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!"; - found_inf_data = true; + LOG(WARNING) << "ContainsNan catch exception"; + return true; + } + + VLOG(4) << "reducemeand result:" << vec[0]; + if (std::isnan(static_cast(vec[0]))) { + LOG(WARNING) << "ContainsNan detects nan"; + return true; + } + + if (std::isinf(static_cast(vec[0]))) { + LOG(WARNING) << "ContainsNan detects inf"; } - return found_inf_data; + return false; } + #endif template @@ -216,22 +225,24 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel { framework::Tensor tmp; tmp.mutable_data({8}, ctx.GetPlace()); - bool check_numerics = false; + bool found_nan = false; auto d_type = in->type(); switch (d_type) { - case framework::proto::VarType::FP16: + case framework::proto::VarType::FP16: { + break; + } case framework::proto::VarType::FP32: { VLOG(4) << "prepare to FoundNanInf"; - check_numerics = CheckNumerics(ctx, dev_ctx->stream(), in); - VLOG(4) << "check_numerics:" << check_numerics; + found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in); + VLOG(4) << "check_numerics:" << found_nan; break; } default: break; } - if (check_numerics) { + if (found_nan) { T inf = static_cast(std::numeric_limits::infinity()); VLOG(4) << "fill input data constant inf"; auto dims = in->dims(); diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc index f1bf9683e35593..ecf9f18d40f86d 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc @@ -38,6 +38,11 @@ limitations under the License. */ #include "paddle/fluid/platform/hccl_helper.h" #endif +// Node1: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=1 GLOG_v=4 RANK_ID=1 +// DEVICE_ID=1 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test +// Node2: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=0 GLOG_v=4 RANK_ID=0 +// DEVICE_ID=0 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test + namespace f = paddle::framework; namespace p = paddle::platform; namespace m = paddle::operators::math; @@ -52,10 +57,11 @@ DECLARE_string(selected_npus); template void PrintDebugInfo(const std::string preStr, const std::vector& data) { std::string debugstring = ""; + std::cout << preStr << ":" << std::endl << debugstring; for (auto ele : data) { - debugstring += std::to_string(ele) + std::string(","); + std::cout << ele << " "; } - VLOG(3) << preStr << ":" << std::endl << debugstring; + std::cout << std::endl; } void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, @@ -120,6 +126,7 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx, ctx.Wait(); } +template void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { // init @@ -130,10 +137,11 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int num1 = 3; int num2 = 128; - std::vector init; + std::vector init; for (int64_t i = 0; i < num1 * num2; ++i) { - init.push_back(1.0 + rank_id); + init.push_back(static_cast(1.0 + rank_id)); } + init[0] = static_cast(std::numeric_limits::quiet_NaN()); PrintDebugInfo("input data", init); auto place = ctx.GetPlace(); @@ -145,31 +153,33 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, auto out = scope->Var("OutData"); auto tensor_out = out->GetMutable(); tensor_out->Resize({num1, num2}); - tensor_out->mutable_data(place); // allocate + tensor_out->mutable_data(place); // allocate ctx.Wait(); // 
run f::AttributeMap attrs; attrs["tag"] = std::string("tagx_" + std::to_string(iter)); attrs["ring_id"] = 0; + attrs["use_calc_stream"] = 1; auto op = f::OpRegistry::CreateOp("c_allreduce_sum", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs); - - for (int i = 0; i < 10; i++) { + for (int i = 0; i < 1; i++) { op->Run(*scope, place); } ctx.Wait(); - std::vector out_vec; + std::vector out_vec; TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); PrintDebugInfo("output data", out_vec); + float diff = static_cast(out_vec[0]) - 65504; + EXPECT_TRUE(diff < 0.1 && diff > -0.1); EXPECT_EQ(out_vec.size(), init.size()); - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], 3.0); + for (uint32_t i = 1; i < 10; i++) { + EXPECT_EQ(out_vec[i], static_cast(3.0)); } } @@ -182,8 +192,7 @@ TEST(c_allreduce_sum, NPU) { // only support one device, if more than one device, use first default PrepareUniqueId(&scope, ctx, &hccl_id); Prepare(&scope, ctx, &hccl_id); - for (int i = 0; i < 1; i++) { - VLOG(2) << "iter num: " << i; - TestHCCLAllReduceOp(&scope, ctx, i); - } + + TestHCCLAllReduceOp(&scope, ctx, 1); + // TestHCCLAllReduceOp(&scope, ctx, 0); } diff --git a/paddle/fluid/operators/collective/checknumeric_npu_test.cc b/paddle/fluid/operators/collective/checknumeric_npu_test.cc new file mode 100644 index 00000000000000..804e8c2a2cbe0c --- /dev/null +++ b/paddle/fluid/operators/collective/checknumeric_npu_test.cc @@ -0,0 +1,99 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_allreduce_sum); +USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU); +DECLARE_string(selected_npus); + +template +bool Check(T value, int size = 2 * 512 * 8192) { + f::Scope scope; + auto x = scope.Var("in"); + auto& ctx = *dynamic_cast( + p::DeviceContextPool::Instance().Get(p::NPUPlace(0))); + auto place = ctx.GetPlace(); + + auto tensor_x = x->GetMutable(); + tensor_x->Resize({size}); + tensor_x->mutable_data(place); // allocate + + std::vector init; + for (int64_t i = 0; i < size; ++i) { + init.push_back(static_cast(value)); + } + + TensorFromVector(init, ctx, tensor_x); + bool result = paddle::operators::ContainsNan(ctx, ctx.stream(), tensor_x); + return result; +} + +TEST(check_numeric, NPU) { + auto inf = std::numeric_limits::infinity(); + auto fp16_inf = static_cast(inf); + auto nan = NAN; + auto fp16_nan = static_cast(nan); + + bool result = false; + // Normal + VLOG(0) << "start normal"; + result = Check(static_cast(65546)); + ASSERT_FALSE(result); + Check(static_cast(1.0)); + ASSERT_FALSE(result); + + // Inf + VLOG(0) << "start inf"; + result = Check(fp16_inf); + ASSERT_FALSE(result); + result = Check(inf); + ASSERT_FALSE(result); + + // Nan + VLOG(0) << "start nan"; + result = Check(fp16_nan); + ASSERT_TRUE(result); + result = Check(nan); + ASSERT_TRUE(result); +} diff --git a/paddle/fluid/operators/controlflow/compare_op_npu.cc b/paddle/fluid/operators/controlflow/compare_op_npu.cc index b1d4d1e7022a32..235d44b92f9195 100644 --- a/paddle/fluid/operators/controlflow/compare_op_npu.cc +++ b/paddle/fluid/operators/controlflow/compare_op_npu.cc @@ -11,21 +11,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/npu_op_runner.h" -#ifdef PADDLE_WITH_ASCEND_CL namespace paddle { namespace operators { -template +template class EqualNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -42,16 +36,33 @@ class EqualNPUKernel : public framework::OpKernel { } }; +template +class NotEqualNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + const auto& runner = NpuOpRunner("NotEqual", {*x, *y}, {*out}, {}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + template class LessThanNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - // int axis = context.Attr("axis"); - z->mutable_data(ctx.GetPlace()); // allocate - const auto& runner = NpuOpRunner("Less", {*x, *y}, {*z}); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + const auto& runner = NpuOpRunner("Less", {*x, *y}, {*out}, {}); auto stream = ctx.template device_context() .stream(); @@ -65,9 +76,10 @@ class LessEqualNPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - z->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("LessEqual", {*x, *y}, {*z}); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + const auto& runner = NpuOpRunner("LessEqual", {*x, *y}, {*out}, {}); auto stream = ctx.template device_context() .stream(); @@ -81,10 +93,10 @@ class GreaterThanNPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); + auto* out = ctx.Output("Out"); - z->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("Greater", {*x, *y}, {*z}); + out->mutable_data(ctx.GetPlace()); + const auto& runner = NpuOpRunner("Greater", {*x, *y}, {*out}, {}); auto stream = ctx.template device_context() .stream(); @@ -98,10 +110,10 @@ class GreaterEqualNPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); + auto* out = ctx.Output("Out"); - z->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("GreaterEqual", {*x, *y}, {*z}); + out->mutable_data(ctx.GetPlace()); + const auto& runner = NpuOpRunner("GreaterEqual", {*x, *y}, {*out}, {}); auto stream = ctx.template device_context() .stream(); @@ -115,32 +127,64 @@ class GreaterEqualNPUKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL(equal, ops::EqualNPUKernel, - ops::EqualNPUKernel, - ops::EqualNPUKernel); +REGISTER_OP_NPU_KERNEL( + equal, 
ops::EqualNPUKernel, + ops::EqualNPUKernel, + ops::EqualNPUKernel, + ops::EqualNPUKernel, + ops::EqualNPUKernel, + ops::EqualNPUKernel, + ops::EqualNPUKernel, + ops::EqualNPUKernel, + ops::EqualNPUKernel); + +REGISTER_OP_NPU_KERNEL( + not_equal, ops::NotEqualNPUKernel, + ops::NotEqualNPUKernel, + ops::NotEqualNPUKernel, + ops::NotEqualNPUKernel, + ops::NotEqualNPUKernel, + ops::NotEqualNPUKernel, + ops::NotEqualNPUKernel, + ops::NotEqualNPUKernel); REGISTER_OP_NPU_KERNEL( - less_than, - ops::LessThanNPUKernel, - ops::LessThanNPUKernel); + less_than, ops::LessThanNPUKernel, + ops::LessThanNPUKernel, + ops::LessThanNPUKernel, + ops::LessThanNPUKernel, + ops::LessThanNPUKernel, + ops::LessThanNPUKernel, + ops::LessThanNPUKernel, + ops::LessThanNPUKernel); REGISTER_OP_NPU_KERNEL( - less_equal, - ops::LessEqualNPUKernel, - ops::LessEqualNPUKernel); + less_equal, ops::LessEqualNPUKernel, + ops::LessEqualNPUKernel, + ops::LessEqualNPUKernel, + ops::LessEqualNPUKernel, + ops::LessEqualNPUKernel, + ops::LessEqualNPUKernel, + ops::LessEqualNPUKernel, + ops::LessEqualNPUKernel); REGISTER_OP_NPU_KERNEL( greater_than, - ops::GreaterThanNPUKernel, - ops::GreaterThanNPUKernel); + ops::GreaterThanNPUKernel, + ops::GreaterThanNPUKernel, + ops::GreaterThanNPUKernel, + ops::GreaterThanNPUKernel, + ops::GreaterThanNPUKernel, + ops::GreaterThanNPUKernel, + ops::GreaterThanNPUKernel, + ops::GreaterThanNPUKernel); REGISTER_OP_NPU_KERNEL( greater_equal, - ops::GreaterEqualNPUKernel, - ops::GreaterEqualNPUKernel); - -#endif + ops::GreaterEqualNPUKernel, + ops::GreaterEqualNPUKernel, + ops::GreaterEqualNPUKernel, + ops::GreaterEqualNPUKernel, + ops::GreaterEqualNPUKernel, + ops::GreaterEqualNPUKernel, + ops::GreaterEqualNPUKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index a469ebbaec2edc..ad9066540c23bf 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -17,7 +17,6 @@ limitations under the License. */ #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc index 72d7e318d7b052..1ba6c4cb1932b1 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_npu.h" #include "paddle/fluid/operators/npu_op_runner.h" namespace paddle { @@ -27,12 +28,37 @@ template class ElementwiseAddNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = + ctx.template device_context(); auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("Add", {*x, *y}, {*out}, {}); + int axis = ctx.Attr("axis"); + + bool direct_compute = false; + auto x_dims = x->dims(); + auto y_dims = y->dims(); + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + if (x_dims.size() >= y_dims.size()) { + direct_compute = + y_dims == framework::slice_ddim(x_dims, axis, x_dims.size()); + } else { + direct_compute = + x_dims == framework::slice_ddim(y_dims, axis, y_dims.size()); + } + + Tensor transformed_x, transformed_y; + if (direct_compute) { + transformed_x.ShareDataWith(*x); + transformed_y.ShareDataWith(*y); + } else { + NpuElementWiseOpBroadcast(dev_ctx, x, y, axis, &transformed_x, + &transformed_y); + } + const auto& runner = + NpuOpRunner("Add", {transformed_x, transformed_y}, {*out}, {}); auto stream = ctx.template device_context() .stream(); @@ -44,109 +70,75 @@ template class ElementwiseAddGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - auto stream = - ctx.template device_context() - .stream(); - - // NOTE(zhiqiu): It seems Ascend Sub follow the broadcast sematics with - // default axis=-1? - // So, the sub_grad should do reduce if needed. - // For example, the shape of each variable in elementwise_sub: - // x, dx: [2, 3, 5] - // y, dy: [1, 5] - // out, dout: [2, 3, 5] - // Then, out = x - y => dx = dout, dy = -dout - // And, the shape of dy can be computed by two stages reduce, - // 1. [2, 3, 5] => [3, 5], ReduceSumD on axis = 0, keep_dims = false. - // 2. [3, 5] => [1, 5], ReduceSumD on axis = 0, keep_dims = true. - + auto& dev_ctx = + ctx.template device_context(); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + + axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis); + auto stream = dev_ctx.stream(); if (dx) { dx->mutable_data(ctx.GetPlace()); - // For dx - // stage 1 - auto reduce_ndim = dout->dims().size() - dx->dims().size(); - std::vector axes; - for (auto i = 0; i < reduce_ndim; ++i) { - axes.push_back(i); - } - Tensor* tmp_dout = const_cast(dout); - Tensor reduced_dout(dx->type()); - if (axes.size() != 0) { - std::vector reduced_dout_dims; - for (auto i = reduce_ndim; i < dout->dims().size(); ++i) { - reduced_dout_dims.push_back(dout->dims()[i]); + if (dx->dims() != dout->dims()) { + std::vector dst_dims_vec; + std::vector reduce_axes; + auto src_dims = dx->dims(); + auto dout_dims = dout->dims(); + + int src_axis = (src_dims.size() < dout_dims.size() ? 
axis : 0); + for (int ax = 0; ax < dout_dims.size(); ++ax) { + if ((ax < src_axis || ax >= src_axis + src_dims.size()) || + (dout_dims[ax] > 1 && src_dims[ax - src_axis] == 1)) { + reduce_axes.push_back(ax); + } else { + dst_dims_vec.push_back(dout_dims[ax]); + } } - reduced_dout.Resize(framework::make_ddim(reduced_dout_dims)); - reduced_dout.mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, - {{"axes", axes}, {"keep_dims", false}}); - runner.Run(stream); - tmp_dout = &reduced_dout; - } - - // stage 2 - axes.clear(); - for (auto i = 0; i < dx->dims().size(); ++i) { - if (dx->dims()[i] == 1) { - axes.push_back(i); + if (!reduce_axes.empty()) { + Tensor tmp; + tmp.ShareDataWith(*dx); + tmp.Resize(framework::make_ddim(dst_dims_vec)); + const auto& runner = + NpuOpRunner("ReduceSumD", {*dout}, {tmp}, + {{"axes", reduce_axes}, {"keep_dims", false}}); + runner.Run(stream); } - } - if (axes.size() != 0) { - const auto& runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx}, - {{"axes", axes}, {"keep_dims", true}}); - runner.Run(stream); } else { - framework::TensorCopy( - *tmp_dout, ctx.GetPlace(), - ctx.template device_context(), dx); + framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dx); } } - if (dy) { - // For dy - // stage 1 - auto reduce_ndim = dout->dims().size() - dy->dims().size(); - std::vector axes; - for (auto i = 0; i < reduce_ndim; ++i) { - axes.push_back(i); - } - Tensor* tmp_dout = const_cast(dout); - Tensor reduced_dout(dout->type()); - if (axes.size() != 0) { - std::vector reduced_dout_dims; - for (auto i = reduce_ndim; i < dout->dims().size(); ++i) { - reduced_dout_dims.push_back(dout->dims()[i]); + dy->mutable_data(ctx.GetPlace()); + if (dy->dims() != dout->dims()) { + std::vector dst_dims_vec; + std::vector reduce_axes; + auto src_dims = dy->dims(); + auto dout_dims = dout->dims(); + + int src_axis = (src_dims.size() < dout_dims.size() ? 
axis : 0); + for (int ax = 0; ax < dout_dims.size(); ++ax) { + if ((ax < src_axis || ax >= src_axis + src_dims.size()) || + (dout_dims[ax] > 1 && src_dims[ax - src_axis] == 1)) { + reduce_axes.push_back(ax); + } else { + dst_dims_vec.push_back(dout_dims[ax]); + } } - reduced_dout.Resize(framework::make_ddim(reduced_dout_dims)); - reduced_dout.mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, - {{"axes", axes}, {"keep_dims", false}}); - runner.Run(stream); - tmp_dout = &reduced_dout; - } - - // stage 2 - axes.clear(); - for (auto i = 0; i < dy->dims().size(); ++i) { - if (dy->dims()[i] == 1) { - axes.push_back(i); + if (!reduce_axes.empty()) { + Tensor tmp; + tmp.ShareDataWith(*dy); + tmp.Resize(framework::make_ddim(dst_dims_vec)); + const auto& runner = + NpuOpRunner("ReduceSumD", {*dout}, {tmp}, + {{"axes", reduce_axes}, {"keep_dims", false}}); + runner.Run(stream); } - } - if (axes.size() != 0) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dy}, - {{"axes", axes}, {"keep_dims", true}}); - runner.Run(stream); } else { - framework::TensorCopy( - *tmp_dout, ctx.GetPlace(), - ctx.template device_context(), dy); + framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dy); } } } diff --git a/paddle/fluid/operators/elementwise/elementwise_npu.h b/paddle/fluid/operators/elementwise/elementwise_npu.h new file mode 100644 index 00000000000000..5ee1ebda90f44c --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_npu.h @@ -0,0 +1,135 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +template +void NpuBroadcast(const platform::NPUDeviceContext& dev_ctx, const Tensor* src, + int axis, const framework::DDim& dst_dims, + Tensor* transformed_src) { + auto stream = dev_ctx.stream(); + + // 1. 
expand the axis with dim 1 + auto src_dims = src->dims(); + Tensor tmp_src; + tmp_src.ShareDataWith(*src); + tmp_src.Resize(src_dims); + for (int i = 0; i < src_dims.size(); ++i) { + if (src_dims[i] == 1 && dst_dims[i + axis] > 1) { + Tensor tmp_tensor; + auto tmp_tensor_dims = tmp_src.dims(); + tmp_tensor_dims[i] = dst_dims[i + axis]; + tmp_tensor.mutable_data(tmp_tensor_dims, dev_ctx.GetPlace()); + const auto& runner = + NpuOpRunner("TileWithAxis", {tmp_src}, {tmp_tensor}, + {{"axis", static_cast(i)}, + {"tiles", static_cast(dst_dims[i + axis])}}); + runner.Run(stream); + tmp_src.ShareDataWith(tmp_tensor); + tmp_src.Resize(tmp_tensor_dims); + } + } + + // 2.expand the ahead axis + auto prev = framework::product(framework::slice_ddim(dst_dims, 0, axis)); + if (prev > 1) { + Tensor tmp_tensor; + auto tmp_tensor_dims = + framework::slice_ddim(dst_dims, 0, axis + src_dims.size()); + tmp_tensor.mutable_data(tmp_tensor_dims, dev_ctx.GetPlace()); + const auto& runner = NpuOpRunner( + "ExpandD", {tmp_src}, {tmp_tensor}, + {{"shape", framework::vectorize(tmp_tensor_dims)}}); + runner.Run(stream); + tmp_src.ShareDataWith(tmp_tensor); + tmp_src.Resize(tmp_tensor_dims); + } else { + tmp_src.Resize(framework::slice_ddim(dst_dims, 0, axis + src_dims.size())); + } + + // 3.expand the tail axis + auto post = framework::product( + framework::slice_ddim(dst_dims, axis + src_dims.size(), dst_dims.size())); + if (post > 1) { + auto src_dims_vec = framework::vectorize(tmp_src.dims()); + src_dims_vec.push_back(1); + tmp_src.Resize(framework::make_ddim(src_dims_vec)); + + Tensor tmp_tensor; + tmp_tensor.mutable_data(dst_dims, dev_ctx.GetPlace()); + const auto& runner = + NpuOpRunner("TileWithAxis", {tmp_src}, {tmp_tensor}, + {{"axis", static_cast(axis + src_dims.size())}, + {"tiles", static_cast(post)}}); + runner.Run(stream); + tmp_src.ShareDataWith(tmp_tensor); + } + tmp_src.Resize(dst_dims); + framework::TensorCopy(tmp_src, dev_ctx.GetPlace(), transformed_src); +} + +template +void NpuElementWiseOpBroadcast(const platform::NPUDeviceContext& dev_ctx, + const Tensor* x, const Tensor* y, int axis, + Tensor* transformed_x, Tensor* transformed_y) { + auto x_dims = x->dims(); + auto y_dims = y->dims(); + bool is_xsize_larger = true; + int max_dim = x_dims.size(); + std::vector dst_dims_vec = framework::vectorize(x_dims); + + if (x_dims.size() < y_dims.size()) { + is_xsize_larger = false; + max_dim = y_dims.size(); + dst_dims_vec = framework::vectorize(y_dims); + } + + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + int x_axis = is_xsize_larger ? 0 : axis; + int y_axis = is_xsize_larger ? 
axis : 0; + + PADDLE_ENFORCE_GE( + axis, 0, + platform::errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, max_dim, + platform::errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, axis)); + + for (int i = 0; i < x_dims.size(); ++i) { + dst_dims_vec[i + x_axis] = + std::max(dst_dims_vec[i + x_axis], static_cast(x_dims[i])); + } + for (int i = 0; i < y_dims.size(); ++i) { + dst_dims_vec[i + y_axis] = + std::max(dst_dims_vec[i + y_axis], static_cast(y_dims[i])); + } + + auto dst_dims = framework::make_ddim(dst_dims_vec); + NpuBroadcast(dev_ctx, x, x_axis, dst_dims, transformed_x); + NpuBroadcast(dev_ctx, y, y_axis, dst_dims, transformed_y); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h index 541ff9aacfc462..95dc6ed342ffc3 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h @@ -163,7 +163,7 @@ struct DimensionsTransform { struct StridesCalculation { std::vector> strides; - std::vector divmoders; + std::vector divmoders; private: // To calculate the strides of each input_tensor. @@ -190,7 +190,7 @@ struct StridesCalculation { strides.resize(N, std::vector(dim_size, 1)); for (int i = 0; i < dim_size; ++i) { - divmoders[i] = FastDivMod(out_dims[i]); + divmoders[i] = platform::FastDivMod(out_dims[i]); } CalculateStrides(N, dim_size, in_dims); } @@ -198,21 +198,21 @@ struct StridesCalculation { template -struct BroadcastArgsWarpper { - using InVecType = CudaAlignedVector; - using OutVecType = CudaAlignedVector; +struct BroadcastArgsWrapper { + using InVecType = platform::CudaAlignedVector; + using OutVecType = platform::CudaAlignedVector; OutT *out_data; OutVecType *vec_out_data; const InT *__restrict__ in_data[ET]; const InVecType *__restrict__ vec_in_data[ET]; bool no_broadcast[ET]; - FastDivMod divmoders[kDims]; + platform::FastDivMod divmoders[kDims]; uint32_t strides[ET][framework::DDim::kMaxRank]; uint32_t scalar_cal_offset; Functor func; - HOSTDEVICE BroadcastArgsWarpper( + HOSTDEVICE BroadcastArgsWrapper( const std::vector &ins, framework::Tensor *out, int scalar_cal_offset, Functor func, const StridesCalculation &offset_calculator) @@ -227,7 +227,7 @@ struct BroadcastArgsWarpper { out_data = out->data(); vec_out_data = reinterpret_cast(out_data); memcpy(divmoders, offset_calculator.divmoders.data(), - kDims * sizeof(FastDivMod)); + kDims * sizeof(platform::FastDivMod)); } __device__ __forceinline__ uint32_t GetOffsetByDivmod(int idx, int in_idx) { @@ -302,30 +302,29 @@ struct BroadcastArgsWarpper { } }; -template __device__ inline void ScalarizedBroadcastKernelImpl( - BroadcastArgsWarpper broadcast_warpper, int tid) { + BroadcastArgsWrapper broadcast_wrapper, int tid) { InT args[ET]; OutT args_out; - broadcast_warpper.LoadScalarizedData(args, tid); + broadcast_wrapper.LoadScalarizedData(args, tid); -#pragma unroll(ET) - for (int j = 1; j < ET; ++j) { - args_out = broadcast_warpper.func(args); - } - broadcast_warpper.StoreScalarizedData(args_out, tid); + // Calcualtion of the in_tensor data. 
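The broadcast path above first aligns the two input shapes to a common output shape, placing the lower-rank tensor at `axis` inside the higher-rank one before TileWithAxis/ExpandD run. A minimal host-side sketch of that shape-alignment rule (standalone C++, hypothetical AlignShapes helper, no Paddle types) follows:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <vector>

// Align two shapes for elementwise broadcasting, mirroring the intent of
// NpuElementWiseOpBroadcast: the lower-rank shape starts at `axis` inside the
// higher-rank one, and every output dimension is the max of the two inputs.
std::vector<int64_t> AlignShapes(const std::vector<int64_t>& x_dims,
                                 const std::vector<int64_t>& y_dims, int axis,
                                 int* x_axis, int* y_axis) {
  const bool x_larger = x_dims.size() >= y_dims.size();
  std::vector<int64_t> dst = x_larger ? x_dims : y_dims;
  if (axis == -1) {
    axis = std::abs(static_cast<int>(x_dims.size()) -
                    static_cast<int>(y_dims.size()));
  }
  *x_axis = x_larger ? 0 : axis;
  *y_axis = x_larger ? axis : 0;
  for (size_t i = 0; i < x_dims.size(); ++i) {
    dst[i + *x_axis] = std::max(dst[i + *x_axis], x_dims[i]);
  }
  for (size_t i = 0; i < y_dims.size(); ++i) {
    dst[i + *y_axis] = std::max(dst[i + *y_axis], y_dims[i]);
  }
  return dst;
}

int main() {
  int x_axis = 0, y_axis = 0;
  // x: [2, 3, 5], y: [3, 5], axis = -1  ->  output [2, 3, 5], y starts at 1.
  auto dst = AlignShapes({2, 3, 5}, {3, 5}, -1, &x_axis, &y_axis);
  assert((dst == std::vector<int64_t>{2, 3, 5}) && x_axis == 0 && y_axis == 1);
  return 0;
}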
+ args_out = broadcast_wrapper.func(args); + + broadcast_wrapper.StoreScalarizedData(args_out, tid); } -template __device__ inline void VectorizedBroadcastKernelImpl( - BroadcastArgsWarpper broadcast_warpper, int tid) { - using OutVecType = CudaAlignedVector; + BroadcastArgsWrapper broadcast_wrapper, int tid) { + using OutVecType = platform::CudaAlignedVector; OutVecType args_out; InT ins[ET]; InT args[ET][VecSize]; - broadcast_warpper.LoadVectorizedData(args, tid); + broadcast_wrapper.LoadVectorizedData(args, tid); #pragma unroll(VecSize) for (int i = 0; i < VecSize; ++i) { @@ -333,30 +332,30 @@ __device__ inline void VectorizedBroadcastKernelImpl( for (int j = 0; j < ET; ++j) { ins[j] = args[j][i]; } - args_out.val[i] = broadcast_warpper.func(ins); + args_out.val[i] = broadcast_wrapper.func(ins); } - broadcast_warpper.StoreVectorizedData(args_out, tid); + broadcast_wrapper.StoreVectorizedData(args_out, tid); } -template __global__ void ElementwiseBroadcastKernel( - BroadcastArgsWarpper broadcast_warpper, int main_tid, int tail_tid) { + BroadcastArgsWrapper broadcast_wrapper, int main_tid, int tail_tid) { int tid = threadIdx.x + blockIdx.x * blockDim.x; // Vectorized calculation of major data whose length is the max multipler of // VecSize, // eg: Calcualting the front 1024-length data in total 1027 data once VecSize // is 4. if (tid < main_tid) { - VectorizedBroadcastKernelImpl( - broadcast_warpper, tid); + VectorizedBroadcastKernelImpl( + broadcast_wrapper, tid); } // Scalarzed calculation of rest data whose lenght cannot fulfill VecSize. // eg: Calcualting the rest 3-length data in total 1027 data once VecSize is // 4. if (tid < tail_tid) { - ScalarizedBroadcastKernelImpl( - broadcast_warpper, tid); + ScalarizedBroadcastKernelImpl( + broadcast_wrapper, tid); } } @@ -367,7 +366,7 @@ void LaunchBroadcastKernelForDifferentDimSize( const std::vector &ins, framework::Tensor *out, int axis, Functor func) { int numel = out->numel(); - const int threads = 256; + int threads = GetThreadsConfig(ctx, numel, VecSize); int blocks = ((numel + VecSize - 1) / VecSize + threads - 1) / threads; int main_tid = numel / VecSize; int tail_tid = numel % VecSize; @@ -380,75 +379,75 @@ void LaunchBroadcastKernelForDifferentDimSize( switch (merge_dims.dim_size) { case 1: { - auto broadcast_warpper = - BroadcastArgsWarpper( + auto broadcast_wrapper = + BroadcastArgsWrapper( ins, out, vec_len, func, offset_calculator); - ElementwiseBroadcastKernel<<>>( - broadcast_warpper, main_tid, tail_tid); + broadcast_wrapper, main_tid, tail_tid); break; } case 2: { - auto broadcast_warpper = - BroadcastArgsWarpper( + auto broadcast_wrapper = + BroadcastArgsWrapper( ins, out, vec_len, func, offset_calculator); - ElementwiseBroadcastKernel<<>>( - broadcast_warpper, main_tid, tail_tid); + broadcast_wrapper, main_tid, tail_tid); break; } case 3: { - auto broadcast_warpper = - BroadcastArgsWarpper( + auto broadcast_wrapper = + BroadcastArgsWrapper( ins, out, vec_len, func, offset_calculator); - ElementwiseBroadcastKernel<<>>( - broadcast_warpper, main_tid, tail_tid); + broadcast_wrapper, main_tid, tail_tid); break; } case 4: { - auto broadcast_warpper = - BroadcastArgsWarpper( + auto broadcast_wrapper = + BroadcastArgsWrapper( ins, out, vec_len, func, offset_calculator); - ElementwiseBroadcastKernel<<>>( - broadcast_warpper, main_tid, tail_tid); + broadcast_wrapper, main_tid, tail_tid); break; } case 5: { - auto broadcast_warpper = - BroadcastArgsWarpper( + auto broadcast_wrapper = + BroadcastArgsWrapper( ins, out, vec_len, func, 
offset_calculator); - ElementwiseBroadcastKernel<<>>( - broadcast_warpper, main_tid, tail_tid); + broadcast_wrapper, main_tid, tail_tid); break; } case 6: { - auto broadcast_warpper = - BroadcastArgsWarpper( + auto broadcast_wrapper = + BroadcastArgsWrapper( ins, out, vec_len, func, offset_calculator); - ElementwiseBroadcastKernel<<>>( - broadcast_warpper, main_tid, tail_tid); + broadcast_wrapper, main_tid, tail_tid); break; } case 7: { - auto broadcast_warpper = - BroadcastArgsWarpper( + auto broadcast_wrapper = + BroadcastArgsWrapper( ins, out, vec_len, func, offset_calculator); - ElementwiseBroadcastKernel<<>>( - broadcast_warpper, main_tid, tail_tid); + broadcast_wrapper, main_tid, tail_tid); break; } case 8: { - auto broadcast_warpper = - BroadcastArgsWarpper( + auto broadcast_wrapper = + BroadcastArgsWrapper( ins, out, vec_len, func, offset_calculator); - ElementwiseBroadcastKernel<<>>( - broadcast_warpper, main_tid, tail_tid); + broadcast_wrapper, main_tid, tail_tid); break; } default: { @@ -473,11 +472,11 @@ void LaunchBroadcastElementwiseCudaKernel( int in_vec_size = 4; framework::Tensor *out = (*outs)[0]; for (auto *in : ins) { - auto temp_size = GetVectorizedSizeImpl(in->data()); + auto temp_size = platform::GetVectorizedSize(in->data()); in_vec_size = in->dims() == out->dims() ? std::min(temp_size, in_vec_size) : in_vec_size; } - int out_vec_size = GetVectorizedSizeImpl(out->data()); + int out_vec_size = platform::GetVectorizedSize(out->data()); int vec_size = std::min(out_vec_size, in_vec_size); switch (vec_size) { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index 101512e35fdcb7..3bd746ace06103 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -26,7 +26,7 @@ limitations under the License. */ namespace paddle { namespace operators { -enum ElementwiseType { kUnary = 1, kBinary = 2 }; +enum ElementwiseType { kUnary = 1, kBinary = 2, kTernary = 3 }; /* * According to NVIDIA, if number of threads per block is 64/128/256/512, @@ -52,98 +52,73 @@ inline int GetThreadsConfig(const platform::CUDADeviceContext &ctx, return std::max(64, threads); } -/* -* Only the address of input data is the multiplier of 1,2,4, vectorized load -* with corresponding multiplier-value is possible. Moreover, the maximum length -* of vectorized load is 128 bits once. Hence, valid length of vectorized load -* shall be determined under both former constraints. -*/ -template -int GetVectorizedSizeImpl(const T *pointer) { - constexpr int max_load_bits = 128; - int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T); - uint64_t address = reinterpret_cast(pointer); - constexpr int vec8 = - std::alignment_of>::value; // NOLINT - constexpr int vec4 = - std::alignment_of>::value; // NOLINT - constexpr int vec2 = - std::alignment_of>::value; // NOLINT - if (address % vec8 == 0) { - /* - * Currently, decide to deal with no more than 4 data once while adopting - * vectorization load/store, if performance test shows that dealing with - * 8 data once in vectorization load/store does get optimized, return code - * below can be changed into " return std::min(8, valid_vec_size); " . 
- */ - return std::min(4, valid_vec_size); - } else if (address % vec4 == 0) { - return std::min(4, valid_vec_size); - } else if (address % vec2 == 0) { - return std::min(2, valid_vec_size); - } else { - return 1; - } -} - template -int GetVectorizedSize(const std::vector &ins, - const std::vector &outs) { +int GetVectorizedSizeForIO(const std::vector &ins, + const std::vector &outs) { int vec_size = 4; for (auto iter = ins.begin(); iter != ins.end(); ++iter) { - vec_size = - std::min(vec_size, GetVectorizedSizeImpl((*iter)->data())); + vec_size = std::min(vec_size, + platform::GetVectorizedSize((*iter)->data())); } for (auto iter = outs.begin(); iter != outs.end(); ++iter) { - vec_size = - std::min(vec_size, GetVectorizedSizeImpl((*iter)->data())); + vec_size = std::min( + vec_size, platform::GetVectorizedSize((*iter)->data())); } return vec_size; } template struct ElementwiseDataWrapper { - OutT *out; - const InT *in0; - const InT *in1; - __device__ ElementwiseDataWrapper(OutT *out, const InT *in0, - const InT *in1 = nullptr) - : out(out), in0(in0), in1(in1) {} - - using InVecType = CudaAlignedVector; - using OutVecType = CudaAlignedVector; - - inline __device__ void load_vector(InVecType args[], int idx) { - const InVecType *x_vec = reinterpret_cast(in0); - args[0] = x_vec[idx]; - if (ET == ElementwiseType::kBinary) { - const InVecType *y_vec = reinterpret_cast(in1); - args[1] = y_vec[idx]; + using InVecType = platform::CudaAlignedVector; + using OutVecType = platform::CudaAlignedVector; + + const InT *__restrict__ in_data[ET]; + OutT *out_data; + uint32_t scalar_cal_offset; + + HOSTDEVICE ElementwiseDataWrapper( + const std::vector &ins, + std::vector *outs, uint32_t scalar_cal_offset) + : scalar_cal_offset(scalar_cal_offset) { +#pragma unroll + for (int i = 0; i < ET; ++i) { + in_data[i] = ins[i]->data(); + } + out_data = (*outs)[0]->data(); + } + + inline __device__ void LoadVectorizedData(InVecType vec_args[], int tid) { +#pragma unroll + for (int i = 0; i < ET; ++i) { + const InVecType *in_vec_data = + reinterpret_cast(in_data[i]); + vec_args[i] = in_vec_data[tid]; } } - inline __device__ void load_scalar(InT args[], int idx) { - args[0] = in0[idx]; - if (ET == ElementwiseType::kBinary) { - args[1] = in1[idx]; + inline __device__ void LoadScalarizedData(InT args[], int tid) { +#pragma unroll + for (int i = 0; i < ET; ++i) { + args[i] = in_data[i][tid + scalar_cal_offset]; } } - inline __device__ void store_vector(OutVecType res, int idx) { - OutVecType *out_vec = reinterpret_cast(out); - out_vec[idx] = res; + inline __device__ void StoreVectorizedData(OutVecType res, int tid) { + OutVecType *out_vec = reinterpret_cast(out_data); + out_vec[tid] = res; } - inline __device__ void store_scalar(OutT res, int idx) { out[idx] = res; } + inline __device__ void StoreScalarizedData(OutT res, int tid) { + out_data[tid + scalar_cal_offset] = res; + } }; -template -__device__ inline void VectorizedKernelImpl( - ElementwiseDataWrapper data, Functor func, - int tid) { - using InVecType = CudaAlignedVector; - using OutVecType = CudaAlignedVector; +template +__device__ inline void VectorizedKernelImpl(ElementwiseWrapper data, + Functor func, int tid) { + using InVecType = platform::CudaAlignedVector; + using OutVecType = platform::CudaAlignedVector; InVecType ins_vec[ET]; OutVecType out_vec; InT *ins_ptr[ET]; @@ -153,7 +128,7 @@ __device__ inline void VectorizedKernelImpl( ins_ptr[i] = reinterpret_cast(&(ins_vec[i])); } // load - data.load_vector(ins_vec, tid); + data.LoadVectorizedData(ins_vec, 
tid); // compute #pragma unroll @@ -165,52 +140,48 @@ __device__ inline void VectorizedKernelImpl( out_vec.val[i] = func(ins); } // store - data.store_vector(out_vec, tid); + data.StoreVectorizedData(out_vec, tid); } -template -__device__ inline void ScalarKernelImpl( - ElementwiseDataWrapper data, Functor func, - int start, int remain) { +template +__device__ inline void ScalarKernelImpl(ElementwiseWrapper data, Functor func, + int tid) { InT ins[ET]; OutT out; - for (int i = 0; i < remain; ++i) { - int idx = start + i; - // load - data.load_scalar(ins, idx); - // compute - out = func(ins); - // store - data.store_scalar(out, idx); - } + // load + data.LoadScalarizedData(ins, tid); + // compute + out = func(ins); + // store + data.StoreScalarizedData(out, tid); } -template -__global__ void VectorizedKernel(const InT *__restrict__ in0, - const InT *__restrict__ in1, OutT *out, - int size, Functor func) { +template +__global__ void VectorizedKernel(ElementwiseWrapper data, int main_tid, + int tail_tid, Functor func) { int tid = blockIdx.x * blockDim.x + threadIdx.x; - int remain = size - VecSize * tid; - remain = remain > 0 ? remain : 0; - auto data = ElementwiseDataWrapper(out, in0, in1); - if (remain >= VecSize) { - VectorizedKernelImpl(data, func, tid); - } else { - ScalarKernelImpl(data, func, tid * VecSize, remain); + + if (tid < main_tid) { + VectorizedKernelImpl( + data, func, tid); + } + if (tid < tail_tid) { + ScalarKernelImpl(data, func, + tid); } } -template -__global__ void ScalarKernel(const InT *__restrict__ in0, - const InT *__restrict__ in1, OutT *out, int size, - Functor func) { - auto data = ElementwiseDataWrapper(out, in0, in1); +template +__global__ void ScalarKernel(ElementwiseWrapper data, int numel, Functor func) { int tid = blockIdx.x * blockDim.x + threadIdx.x; - int remain = tid < size ? 1 : 0; - ScalarKernelImpl(data, func, tid, remain); + if (tid < numel) { + ScalarKernelImpl(data, func, + tid); + } } template @@ -219,35 +190,48 @@ void LaunchSameDimsElementwiseCudaKernel( const std::vector &ins, std::vector *outs, Functor func) { // calculate the max vec_size for all ins and outs - auto size = ins[0]->numel(); - int vec_size = GetVectorizedSize(ins, *outs); - int block_size = GetThreadsConfig(ctx, size, vec_size); + auto numel = ins[0]->numel(); + int vec_size = GetVectorizedSizeForIO(ins, *outs); + int block_size = GetThreadsConfig(ctx, numel, vec_size); int grid_size = - ((size + vec_size - 1) / vec_size + block_size - 1) / block_size; - const InT *in0 = ins[0]->data(); - const InT *in1 = - (ET == ElementwiseType::kBinary) ? 
ins[1]->data() : nullptr; - OutT *out = (*outs)[0]->data(); + ((numel + vec_size - 1) / vec_size + block_size - 1) / block_size; + int main_tid = numel / vec_size; + int tail_tid = numel % vec_size; + uint32_t vec_len = main_tid * vec_size; + // cuda kernel auto stream = ctx.stream(); switch (vec_size) { - case 4: - VectorizedKernel<<>>( - in0, in1, out, size, func); + case 4: { + auto data_wrapper = + ElementwiseDataWrapper(ins, outs, vec_len); + VectorizedKernel<<>>( + data_wrapper, main_tid, tail_tid, func); break; - case 2: - VectorizedKernel<<>>( - in0, in1, out, size, func); + } + case 2: { + auto data_wrapper = + ElementwiseDataWrapper(ins, outs, vec_len); + VectorizedKernel<<>>( + data_wrapper, main_tid, tail_tid, func); break; - case 1: - ScalarKernel<<>>(in0, in1, out, - size, func); + } + case 1: { + auto data_wrapper = + ElementwiseDataWrapper(ins, outs, 0); + ScalarKernel<<>>(data_wrapper, + numel, func); break; - default: + } + default: { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported vectorized size: %d !", vec_size)); break; + } } } diff --git a/paddle/fluid/operators/expand_as_op.h b/paddle/fluid/operators/expand_as_op.h old mode 100755 new mode 100644 index 406455af741715..07ba0e5ad87133 --- a/paddle/fluid/operators/expand_as_op.h +++ b/paddle/fluid/operators/expand_as_op.h @@ -13,42 +13,12 @@ limitations under the License. */ #include -#include -#include -#include -#include -#include -#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/eigen/eigen_function.h" #define MAX_RANK_SUPPORTED 6 -// 1. BOOST_PP_REPEAT macro represents a fast horizontal repetition construct. -// Usage: BOOST_PP_REPEAT(count, macro, data). -// This macro expands to the sequence: -// macro(z, 0, data) macro(z, 1, data) ... macro(z, count - 1, data). -// 2. As for our case, count = MAX_RANK_SUPPORTED(which is 6). -// So the range of n is 0-5(which is count-1). -// We want to generate case 1-6 instead of case 0-5. -// So we need to change n to n + 1. -#define EXPAND_AS_TEMPLATE(z, n, data) \ - case n + 1: { \ - ExpandAs(context); \ - break; \ - } -#define REP_EXPAND_AS_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_AS_TEMPLATE, ~) -#define COND(n) BOOST_PP_GREATER_EQUAL(n, BOOST_PP_MOD(n, MAX_RANK_SUPPORTED)) -#define EXPAND_AS_GRAD_CASE(n) \ - case n + 1: { \ - ExpandAsBackward(context, reshape_dims_vec, reduce_dims_vec); \ - break; \ - } -#define EXPAND_AS_GRAD_TEMPLATE(z, n, data) \ - BOOST_PP_IF(COND(n), EXPAND_AS_GRAD_CASE(n), ) -#define REP_EXPAND_AS_GRAD_TEMPLATE(n) \ - BOOST_PP_REPEAT(n, EXPAND_AS_GRAD_TEMPLATE, ~) namespace paddle { namespace operators { @@ -67,7 +37,24 @@ class ExpandAsKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto rank = context.Input("X")->dims().size(); switch (rank) { - REP_EXPAND_AS_TEMPLATE(MAX_RANK_SUPPORTED) + case 1: + ExpandAs<1>(context); + break; + case 2: + ExpandAs<2>(context); + break; + case 3: + ExpandAs<3>(context); + break; + case 4: + ExpandAs<4>(context); + break; + case 5: + ExpandAs<5>(context); + break; + case 6: + ExpandAs<6>(context); + break; default: PADDLE_THROW(platform::errors::InvalidArgument( "Only support tensor with rank being between 1 and 6. 
But received " @@ -165,7 +152,24 @@ class ExpandAsGradKernel : public framework::OpKernel { "to %d, but the value received is %d.", MAX_RANK_SUPPORTED, dims)); switch (dims) { - REP_EXPAND_AS_GRAD_TEMPLATE(MAX_RANK_SUPPORTED) + case 1: + ExpandAsBackward<1>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 2: + ExpandAsBackward<2>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 3: + ExpandAsBackward<3>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 4: + ExpandAsBackward<4>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 5: + ExpandAsBackward<5>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 6: + ExpandAsBackward<6>(context, reshape_dims_vec, reduce_dims_vec); + break; default: PADDLE_THROW(platform::errors::InvalidArgument( "Only support tensor with rank being between 1 and 6. But " diff --git a/paddle/fluid/operators/expand_as_v2_op.h b/paddle/fluid/operators/expand_as_v2_op.h old mode 100755 new mode 100644 index 6df4c592378cb2..3e8f7d15880bcd --- a/paddle/fluid/operators/expand_as_v2_op.h +++ b/paddle/fluid/operators/expand_as_v2_op.h @@ -14,42 +14,12 @@ limitations under the License. */ #include #include -#include -#include -#include -#include -#include -#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/eigen/eigen_function.h" #define MAX_RANK_SUPPORTED 6 -// 1. BOOST_PP_REPEAT macro represents a fast horizontal repetition construct. -// Usage: BOOST_PP_REPEAT(count, macro, data). -// This macro expands to the sequence: -// macro(z, 0, data) macro(z, 1, data) ... macro(z, count - 1, data). -// 2. As for our case, count = MAX_RANK_SUPPORTED(which is 6). -// So the range of n is 0-5(which is count-1). -// We want to generate case 1-6 instead of case 0-5. -// So we need to change n to n + 1. 
-#define EXPAND_AS_TEMPLATE(z, n, data) \ - case n + 1: { \ - ExpandAs(context); \ - break; \ - } -#define REP_EXPAND_AS_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_AS_TEMPLATE, ~) -#define COND(n) BOOST_PP_GREATER_EQUAL(n, BOOST_PP_MOD(n, MAX_RANK_SUPPORTED)) -#define EXPAND_AS_GRAD_CASE(n) \ - case n + 1: { \ - ExpandAsBackward(context, reshape_dims_vec, reduce_dims_vec); \ - break; \ - } -#define EXPAND_AS_GRAD_TEMPLATE(z, n, data) \ - BOOST_PP_IF(COND(n), EXPAND_AS_GRAD_CASE(n), ) -#define REP_EXPAND_AS_GRAD_TEMPLATE(n) \ - BOOST_PP_REPEAT(n, EXPAND_AS_GRAD_TEMPLATE, ~) namespace paddle { namespace operators { @@ -85,7 +55,26 @@ class ExpandAsV2Kernel : public framework::OpKernel { "expand_as_v2 op must be less than or equal to %d.", target_rank, MAX_RANK_SUPPORTED)); - switch (target_rank) { REP_EXPAND_AS_TEMPLATE(MAX_RANK_SUPPORTED) } + switch (target_rank) { + case 1: + ExpandAs<1>(context); + break; + case 2: + ExpandAs<2>(context); + break; + case 3: + ExpandAs<3>(context); + break; + case 4: + ExpandAs<4>(context); + break; + case 5: + ExpandAs<5>(context); + break; + case 6: + ExpandAs<6>(context); + break; + } } protected: @@ -186,7 +175,24 @@ class ExpandAsV2GradKernel : public framework::OpKernel { "to %d, but the value received is %d.", MAX_RANK_SUPPORTED, dims)); switch (dims) { - REP_EXPAND_AS_GRAD_TEMPLATE(MAX_RANK_SUPPORTED) + case 1: + ExpandAsBackward<1>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 2: + ExpandAsBackward<2>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 3: + ExpandAsBackward<3>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 4: + ExpandAsBackward<4>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 5: + ExpandAsBackward<5>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 6: + ExpandAsBackward<6>(context, reshape_dims_vec, reduce_dims_vec); + break; default: PADDLE_THROW(platform::errors::InvalidArgument( "Only support tensor with rank being between 1 and 6. But " diff --git a/paddle/fluid/operators/expand_as_v2_op_npu.cc b/paddle/fluid/operators/expand_as_v2_op_npu.cc new file mode 100644 index 00000000000000..76cb12330b5cd3 --- /dev/null +++ b/paddle/fluid/operators/expand_as_v2_op_npu.cc @@ -0,0 +1,96 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/expand_as_v2_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class ExpandAsV2NPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto rank = context.Input("X")->dims().size(); + auto target_shape = context.Attr>("target_shape"); + auto target_rank = target_shape.size(); + PADDLE_ENFORCE_GE(target_rank, rank, + platform::errors::InvalidArgument( + "The rank (%d) of the input 'target_tensor' for " + "expand_as_v2 op must be greater than or equal to " + "the rank (%d) of the input 'x'.", + target_rank, rank)); + PADDLE_ENFORCE_GE(rank, 1, platform::errors::InvalidArgument( + "The rank (%d) of the input 'x' for " + "expand_as_v2 op must be positive.", + rank)); + PADDLE_ENFORCE_LE(target_rank, MAX_RANK_SUPPORTED, + platform::errors::InvalidArgument( + "The rank (%d) of the input 'target_tensor' for " + "expand_as_v2 op must be less than or equal to %d.", + target_rank, MAX_RANK_SUPPORTED)); + ExpandAs(context); + } + + protected: + void ExpandAs(const framework::ExecutionContext& context) const { + auto* in0 = context.Input("X"); + auto in_dims = in0->dims(); + auto target_shape = context.Attr>("target_shape"); + auto vec_in_dims = framework::vectorize(in_dims); + auto diff = target_shape.size() - vec_in_dims.size(); + vec_in_dims.insert(vec_in_dims.begin(), diff, 1); + + for (size_t i = 0; i < vec_in_dims.size(); ++i) { + PADDLE_ENFORCE_NE(target_shape[i], 0, + platform::errors::InvalidArgument( + "The value of target shape cannot be zero.")); + if (vec_in_dims[i] != 1) { + PADDLE_ENFORCE_EQ( + vec_in_dims[i], target_shape[i], + platform::errors::InvalidArgument( + "The value (%d) of the non-singleton dimension does not match" + " the corresponding value (%d) in " + "target tensor for expand_as_v2 op.", + vec_in_dims[i], target_shape[i])); + } + } + auto* out0 = context.Output("Out"); + + framework::DDim out_dims = framework::make_ddim(target_shape); + + out0->Resize(out_dims); + out0->mutable_data(context.GetPlace()); + + const auto& runner = + NpuOpRunner("ExpandD", {*in0}, {*out0}, {{"shape", target_shape}}); + + auto stream = + context.template device_context() + .stream(); + + runner.Run(stream); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_NPU_KERNEL( + expand_as_v2, + ops::ExpandAsV2NPUKernel, + ops::ExpandAsV2NPUKernel, + ops::ExpandAsV2NPUKernel, + ops::ExpandAsV2NPUKernel, + ops::ExpandAsV2NPUKernel); diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h old mode 100755 new mode 100644 index e566d69096595c..809bad1d6c1eec --- a/paddle/fluid/operators/expand_op.h +++ b/paddle/fluid/operators/expand_op.h @@ -16,41 +16,12 @@ limitations under the License. */ #include -#include -#include -#include -#include -#include -#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/eigen/eigen_function.h" #define MAX_RANK_SUPPORTED 6 -// 1. BOOST_PP_REPEAT macro represents a fast horizontal repetition construct. -// Usage: BOOST_PP_REPEAT(count, macro, data). -// This macro expands to the sequence: -// macro(z, 0, data) macro(z, 1, data) ... macro(z, count - 1, data). -// 2. As for our case, count = MAX_RANK_SUPPORTED(which is 6). -// So the range of n is 0-5(which is count-1). 
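ExpandAsV2NPUKernel above left-pads the input shape with 1s to the target rank and requires every non-singleton dimension to equal the target value. A host-only sketch of that compatibility check (hypothetical CanExpandAs helper, not part of the patch) is:

#include <cassert>
#include <cstdint>
#include <stdexcept>
#include <vector>

// Returns true when `in_dims`, left-padded with 1s to the target rank, can be
// expanded to `target`: every padded dimension is 1 or equals the target.
bool CanExpandAs(std::vector<int64_t> in_dims,
                 const std::vector<int64_t>& target) {
  if (in_dims.size() > target.size()) return false;
  in_dims.insert(in_dims.begin(), target.size() - in_dims.size(), 1);
  for (size_t i = 0; i < target.size(); ++i) {
    if (target[i] == 0) {
      throw std::invalid_argument("target shape must not contain zeros");
    }
    if (in_dims[i] != 1 && in_dims[i] != target[i]) return false;
  }
  return true;
}

int main() {
  assert(CanExpandAs({3, 1}, {2, 3, 4}));   // [3,1] -> [1,3,1] -> [2,3,4]
  assert(!CanExpandAs({3, 2}, {2, 3, 4}));  // last dim 2 is neither 1 nor 4
  return 0;
}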
-// We want to generate case 1-6 instead of case 0-5. -// So we need to change n to n + 1. -#define EXPAND_TEMPLATE(z, n, data) \ - case n + 1: { \ - Expand(context); \ - break; \ - } -#define REP_EXPAND_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_TEMPLATE, ~) -#define COND(n) BOOST_PP_GREATER_EQUAL(n, BOOST_PP_MOD(n, MAX_RANK_SUPPORTED)) -#define EXPAND_GRAD_CASE(n) \ - case n + 1: { \ - ExpandBackward(context, reshape_dims_vec, reduce_dims_vec); \ - break; \ - } -#define EXPAND_GRAD_TEMPLATE(z, n, data) \ - BOOST_PP_IF(COND(n), EXPAND_GRAD_CASE(n), ) -#define REP_EXPAND_GRAD_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_GRAD_TEMPLATE, ~) namespace paddle { namespace operators { @@ -137,7 +108,26 @@ class ExpandKernel : public framework::OpKernel { "The number of dimensions of the input 'x' for Op(expand) " "must be less than or equal to %d, but the value received is %d.", MAX_RANK_SUPPORTED, rank)); - switch (rank) { REP_EXPAND_TEMPLATE(MAX_RANK_SUPPORTED) } + switch (rank) { + case 1: + Expand<1>(context); + break; + case 2: + Expand<2>(context); + break; + case 3: + Expand<3>(context); + break; + case 4: + Expand<4>(context); + break; + case 5: + Expand<5>(context); + break; + case 6: + Expand<6>(context); + break; + } } protected: @@ -233,7 +223,24 @@ class ExpandGradKernel : public framework::OpKernel { "to %d, but the value received is %d.", MAX_RANK_SUPPORTED, dims)); switch (dims) { - REP_EXPAND_GRAD_TEMPLATE(MAX_RANK_SUPPORTED) + case 1: + ExpandBackward<1>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 2: + ExpandBackward<2>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 3: + ExpandBackward<3>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 4: + ExpandBackward<4>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 5: + ExpandBackward<5>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 6: + ExpandBackward<6>(context, reshape_dims_vec, reduce_dims_vec); + break; default: PADDLE_THROW(platform::errors::InvalidArgument( "Only support tensor with rank being between 1 and 6. But " diff --git a/paddle/fluid/operators/expand_op_npu.cc b/paddle/fluid/operators/expand_op_npu.cc index 76d5a203f306b9..2f66316c483a9c 100644 --- a/paddle/fluid/operators/expand_op_npu.cc +++ b/paddle/fluid/operators/expand_op_npu.cc @@ -39,7 +39,26 @@ class ExpandNPUKernel : public framework::OpKernel { "The number of dimensions of the input 'x' for Op(expand) " "must be less than or equal to %d, but the value received is %d.", MAX_RANK_SUPPORTED, rank)); - switch (rank) { REP_EXPAND_TEMPLATE(MAX_RANK_SUPPORTED) } + switch (rank) { + case 1: + Expand<1>(context); + break; + case 2: + Expand<2>(context); + break; + case 3: + Expand<3>(context); + break; + case 4: + Expand<4>(context); + break; + case 5: + Expand<5>(context); + break; + case 6: + Expand<6>(context); + break; + } } protected: diff --git a/paddle/fluid/operators/expand_v2_op.h b/paddle/fluid/operators/expand_v2_op.h old mode 100755 new mode 100644 index 8a87a067c51f11..a720bd7b551823 --- a/paddle/fluid/operators/expand_v2_op.h +++ b/paddle/fluid/operators/expand_v2_op.h @@ -17,41 +17,12 @@ limitations under the License. */ #include #include -#include -#include -#include -#include -#include -#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/eigen/eigen_function.h" #define MAX_RANK_SUPPORTED 6 -// 1. BOOST_PP_REPEAT macro represents a fast horizontal repetition construct. 
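ExpandAsBackward/ExpandBackward take a reshape_dims_vec / reduce_dims_vec pair because the gradient of an expand is the sum of dOut over the replicated copies. A tiny illustration of that reduction for the 1-D case, assuming the copies are laid out as [times, n] in row-major order (which is how the reshape/reduce pair treats them), is:

#include <cassert>
#include <vector>

// Gradient of a 1-D expand: dout has length times * n; view it as [times, n]
// and sum over the first axis to recover dx of length n.
std::vector<float> ExpandGrad1D(const std::vector<float>& dout, int n,
                                int times) {
  std::vector<float> dx(n, 0.f);
  for (int t = 0; t < times; ++t) {
    for (int i = 0; i < n; ++i) dx[i] += dout[t * n + i];
  }
  return dx;
}

int main() {
  // x has shape [2], expand_times = [3], so dout has shape [6].
  std::vector<float> dout = {1, 2, 3, 4, 5, 6};
  auto dx = ExpandGrad1D(dout, /*n=*/2, /*times=*/3);
  assert(dx[0] == 1 + 3 + 5 && dx[1] == 2 + 4 + 6);  // dx = [9, 12]
  return 0;
}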
-// Usage: BOOST_PP_REPEAT(count, macro, data). -// This macro expands to the sequence: -// macro(z, 0, data) macro(z, 1, data) ... macro(z, count - 1, data). -// 2. As for our case, count = MAX_RANK_SUPPORTED(which is 6). -// So the range of n is 0-5(which is count-1). -// We want to generate case 1-6 instead of case 0-5. -// So we need to change n to n + 1. -#define EXPAND_TEMPLATE(z, n, data) \ - case n + 1: { \ - Expand(context); \ - break; \ - } -#define REP_EXPAND_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_TEMPLATE, ~) -#define COND(n) BOOST_PP_GREATER_EQUAL(n, BOOST_PP_MOD(n, MAX_RANK_SUPPORTED)) -#define EXPAND_GRAD_CASE(n) \ - case n + 1: { \ - ExpandBackward(context, reshape_dims_vec, reduce_dims_vec); \ - break; \ - } -#define EXPAND_GRAD_TEMPLATE(z, n, data) \ - BOOST_PP_IF(COND(n), EXPAND_GRAD_CASE(n), ) -#define REP_EXPAND_GRAD_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_GRAD_TEMPLATE, ~) namespace paddle { namespace operators { @@ -132,7 +103,26 @@ class ExpandV2Kernel : public framework::OpKernel { "less than or equal to %d.", shape_size, MAX_RANK_SUPPORTED)); rank = std::max(rank, static_cast(shape_size)); - switch (rank) { REP_EXPAND_TEMPLATE(MAX_RANK_SUPPORTED) } + switch (rank) { + case 1: + Expand<1>(context); + break; + case 2: + Expand<2>(context); + break; + case 3: + Expand<3>(context); + break; + case 4: + Expand<4>(context); + break; + case 5: + Expand<5>(context); + break; + case 6: + Expand<6>(context); + break; + } } protected: @@ -271,7 +261,24 @@ class ExpandV2GradKernel : public framework::OpKernel { "to %d, but the value received is %d.", MAX_RANK_SUPPORTED, dims)); switch (dims) { - REP_EXPAND_GRAD_TEMPLATE(MAX_RANK_SUPPORTED) + case 1: + ExpandBackward<1>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 2: + ExpandBackward<2>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 3: + ExpandBackward<3>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 4: + ExpandBackward<4>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 5: + ExpandBackward<5>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 6: + ExpandBackward<6>(context, reshape_dims_vec, reduce_dims_vec); + break; default: PADDLE_THROW(platform::errors::InvalidArgument( "Only support tensor with rank being between 1 and 6. But " diff --git a/paddle/fluid/operators/eye_op_npu.cc b/paddle/fluid/operators/eye_op_npu.cc new file mode 100644 index 00000000000000..c23f24b78441f1 --- /dev/null +++ b/paddle/fluid/operators/eye_op_npu.cc @@ -0,0 +1,59 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/eye_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class EyeNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto num_rows = ctx.Attr("num_rows"); + + auto d_nums = ctx.Attr("dtype"); + auto dtype = + ConvertToNpuDtype(static_cast(d_nums)); + + auto num_columns = ctx.Attr("num_columns"); + if (num_columns == -1) num_columns = num_rows; + + framework::NPUAttributeMap attr_input = { + {"num_rows", num_rows}, {"num_columns", num_columns}, {"dtype", dtype}}; + + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + const auto& runner = NpuOpRunner("Eye", {}, {*out}, attr_input); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + eye, ops::EyeNPUKernel, + ops::EyeNPUKernel, + ops::EyeNPUKernel); diff --git a/paddle/fluid/operators/fill_any_like_op.h b/paddle/fluid/operators/fill_any_like_op.h index 9c514ed3aaa38f..2fb7bf985f222a 100644 --- a/paddle/fluid/operators/fill_any_like_op.h +++ b/paddle/fluid/operators/fill_any_like_op.h @@ -45,15 +45,18 @@ class FillAnyLikeKernel : public framework::OpKernel { static_cast(std::numeric_limits::lowest())) && (common_type_value <= static_cast(std::numeric_limits::max())), - true, platform::errors::InvalidArgument( - "filled value is out of range for" - " targeted type in fill_any_like, your kernel type is %s" - ", please check value you set.", - typeid(T).name())); + true, + platform::errors::InvalidArgument( + "The filled value is out of range for target type, " + "current kernel type is %s, the range should between %f " + "and %f, but now value is %f.", + typeid(T).name(), + static_cast(std::numeric_limits::lowest()), + static_cast(std::numeric_limits::max()), value)); + PADDLE_ENFORCE_EQ( std::isnan(value), false, - platform::errors::InvalidArgument("filled value should not be NaN," - " but received NaN")); + platform::errors::InvalidArgument("The filled value is NaN.")); math::SetConstant setter; setter(context.template device_context(), out, diff --git a/paddle/fluid/operators/fill_any_like_op_npu.cc b/paddle/fluid/operators/fill_any_like_op_npu.cc new file mode 100644 index 00000000000000..d5204f5cacfc68 --- /dev/null +++ b/paddle/fluid/operators/fill_any_like_op_npu.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/fill_any_like_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class FillAnyLikeNPUKernel : public framework::OpKernel { + public: + using CommonType = typename std::common_type< + float, + typename std::conditional::value, + float, T>::type>::type; + + void Compute(const framework::ExecutionContext& context) const override { + auto data_type = static_cast( + context.Attr("dtype")); + auto* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + + float value = context.Attr("value"); + + auto common_type_value = static_cast(value); + + PADDLE_ENFORCE_EQ( + (common_type_value >= + static_cast(std::numeric_limits::lowest())) && + (common_type_value <= + static_cast(std::numeric_limits::max())), + true, + platform::errors::InvalidArgument( + "The filled value is out of range for target type, " + "current kernel type is %s, the range should between %f " + "and %f, but now value is %f.", + typeid(T).name(), + static_cast(std::numeric_limits::lowest()), + static_cast(std::numeric_limits::max()), value)); + + PADDLE_ENFORCE_EQ( + std::isnan(value), false, + platform::errors::InvalidArgument("The filled value is NaN.")); + + Tensor tensor_tmp(data_type); + tensor_tmp.mutable_data({1}, context.GetPlace()); + FillNpuTensorWithConstant(&tensor_tmp, static_cast(value)); + + auto stream = + context.template device_context() + .stream(); + + auto shape = out->dims(); + const auto& runner = NpuOpRunner("FillD", {tensor_tmp}, {*out}, + {{"dims", framework::vectorize(shape)}}); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL(fill_any_like, ops::FillAnyLikeNPUKernel, + ops::FillAnyLikeNPUKernel, + ops::FillAnyLikeNPUKernel); diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc new file mode 100644 index 00000000000000..7edddce65cc6f5 --- /dev/null +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc @@ -0,0 +1,97 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fill_constant_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/operators/utils.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class FillConstantBatchSizeLikeOpNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto data_type = + static_cast(ctx.Attr("dtype")); + auto float_value = ctx.Attr("value"); + auto str_value = ctx.Attr("str_value"); + auto force_cpu = ctx.Attr("force_cpu"); + + auto *out = ctx.Output("Out"); + auto *input = ctx.Input("Input"); + if (&ctx.Attr("input_dim_idx") == 0) { + // set the correct batch size. 
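Both fill_any_like kernels above guard the float attribute with the same range check before filling: the value is cast to a common type and compared against the numeric limits of the kernel's element type. A standalone sketch of that check (simplified to plain std::common_type<float, T>, without the float16 special case) is:

#include <cmath>
#include <cstdint>
#include <iostream>
#include <limits>
#include <stdexcept>
#include <type_traits>

// Reject fill values the target element type T cannot represent, mirroring
// the PADDLE_ENFORCE_EQ checks above (minus the float16 handling).
template <typename T>
void CheckFillValue(float value) {
  using CommonType = typename std::common_type<float, T>::type;
  if (std::isnan(value)) throw std::invalid_argument("fill value is NaN");
  const auto v = static_cast<CommonType>(value);
  if (v < static_cast<CommonType>(std::numeric_limits<T>::lowest()) ||
      v > static_cast<CommonType>(std::numeric_limits<T>::max())) {
    throw std::out_of_range("fill value out of range for the target type");
  }
}

int main() {
  CheckFillValue<int32_t>(7.0f);      // representable, passes
  try {
    CheckFillValue<int8_t>(1000.0f);  // 1000 does not fit into int8_t
  } catch (const std::out_of_range&) {
    std::cout << "rejected out-of-range fill value\n";
  }
  return 0;
}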
+ auto odims = out->dims(); + int input_dim_idx = ctx.Attr("input_dim_idx"); + int output_dim_idx = ctx.Attr("output_dim_idx"); + odims[output_dim_idx] = input->dims()[input_dim_idx]; + out->mutable_data(odims, ctx.GetPlace()); + } + + T value; + if (str_value.empty()) { + value = static_cast(float_value); + } else { + std::stringstream convert_stream(str_value); + if (std::is_same::value) { + int64_t tmp_value; + convert_stream >> tmp_value; + value = static_cast(tmp_value); + } else { + double tmp_value; + convert_stream >> tmp_value; + value = static_cast(tmp_value); + } + } + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(ctx.GetPlace()); + bool cpu_place = force_cpu || ctx.GetPlace() == platform::CPUPlace(); + if (cpu_place) { + math::SetConstant functor; + out->mutable_data(platform::CPUPlace(), data_type); + functor(reinterpret_cast(dev_ctx), + out, static_cast(value)); + } else { + out->mutable_data(ctx.GetPlace(), data_type); + Tensor tensor_tmp(data_type); + tensor_tmp.mutable_data({1}, ctx.GetPlace()); + FillNpuTensorWithConstant(&tensor_tmp, value); + + auto stream = + ctx.template device_context() + .stream(); + const auto &runner = + NpuOpRunner("FillD", {tensor_tmp}, {*out}, + {{"dims", framework::vectorize(out->dims())}}); + runner.Run(stream); + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + fill_constant_batch_size_like, + ops::FillConstantBatchSizeLikeOpNPUKernel< + paddle::platform::NPUDeviceContext, float>, + ops::FillConstantBatchSizeLikeOpNPUKernel< + paddle::platform::NPUDeviceContext, int>, + ops::FillConstantBatchSizeLikeOpNPUKernel< + paddle::platform::NPUDeviceContext, paddle::platform::float16>); diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index d465e77ea1886f..0dcbb6e727de78 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -36,7 +36,6 @@ class FillConstantOp : public framework::OperatorWithKernel { i, shape[i], framework::make_ddim(shape))); } } - if (shape.empty() && ctx->HasInput("ShapeTensor")) { auto shape_dims = ctx->GetInputDim("ShapeTensor"); int num_ele = 1; diff --git a/paddle/fluid/operators/flatten_op_npu.cc b/paddle/fluid/operators/flatten_op_npu.cc new file mode 100644 index 00000000000000..1569760fe3b96f --- /dev/null +++ b/paddle/fluid/operators/flatten_op_npu.cc @@ -0,0 +1,112 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
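The fill_constant_batch_size_like NPU kernel above takes the constant either from the float attribute or from str_value, parsing the string through a stringstream as int64_t or double before the final cast to T. A minimal sketch of that parsing rule (hypothetical GetFillValue helper) is:

#include <cassert>
#include <cstdint>
#include <sstream>
#include <string>
#include <type_traits>

// Resolve the constant from (float_value, str_value): a non-empty str_value
// wins and is parsed as int64_t or double before the cast to T, so large
// integer constants are not squeezed through a float first.
template <typename T>
T GetFillValue(float float_value, const std::string& str_value) {
  if (str_value.empty()) return static_cast<T>(float_value);
  std::stringstream convert_stream(str_value);
  if (std::is_same<T, int64_t>::value) {
    int64_t tmp = 0;
    convert_stream >> tmp;
    return static_cast<T>(tmp);
  }
  double tmp = 0.0;
  convert_stream >> tmp;
  return static_cast<T>(tmp);
}

int main() {
  assert(GetFillValue<int64_t>(0.f, "123456789012345") == 123456789012345LL);
  assert(GetFillValue<float>(2.5f, "") == 2.5f);
  return 0;
}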
+#include "paddle/fluid/operators/flatten_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class Flatten2NPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *in = context.Input("X"); + auto *out = context.Output("Out"); + auto &axis = context.Attr("axis"); + out->mutable_data(context.GetPlace(), in->type()); + framework::NPUAttributeMap attr_input = {{"axis", axis}}; + + auto stream = + context.template device_context() + .stream(); + const auto &runner = NpuOpRunner("FlattenV2", {*in}, {*out}, attr_input); + runner.Run(stream); + } +}; + +template +class Flatten2GradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_out = + ctx.Input(framework::GradVarName("Out")); + + auto xshape_dims = ctx.Input("XShape")->dims(); + auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + + d_x->mutable_data(ctx.GetPlace(), d_out->type()); + framework::TensorCopy( + *d_out, ctx.GetPlace(), + ctx.template device_context(), d_x); + d_x->Resize(x_dims); + } +}; + +using Tensor = framework::Tensor; + +template +class FlattenContiguousRangeNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *X = ctx.Input("X"); + auto *Out = ctx.Output("Out"); + int start_axis = ctx.Attr("start_axis"); + int stop_axis = ctx.Attr("stop_axis"); + + Out->mutable_data(ctx.GetPlace()); + + const auto &runner = + NpuOpRunner("FlattenV2", {*X}, {*Out}, + {{"axis", static_cast(start_axis)}, + {"end_axis", static_cast(stop_axis)}}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL(flatten2, ops::Flatten2NPUKernel, + ops::Flatten2NPUKernel, + ops::Flatten2NPUKernel, + ops::Flatten2NPUKernel, + ops::Flatten2NPUKernel, + ops::Flatten2NPUKernel); +REGISTER_OP_NPU_KERNEL(flatten2_grad, ops::Flatten2GradNPUKernel, + ops::Flatten2GradNPUKernel, + ops::Flatten2GradNPUKernel, + ops::Flatten2GradNPUKernel, + ops::Flatten2GradNPUKernel, + ops::Flatten2GradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + flatten_contiguous_range, + ops::FlattenContiguousRangeNPUKernel, + ops::FlattenContiguousRangeNPUKernel, + ops::FlattenContiguousRangeNPUKernel, + ops::FlattenContiguousRangeNPUKernel, + ops::FlattenContiguousRangeNPUKernel, + ops::FlattenContiguousRangeNPUKernel); diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index 05d521be5a1064..b9fbd18cf146c8 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -71,8 +71,17 @@ class HierarchicalSigmoidOp : public framework::OperatorWithKernel { if (with_prefetch) { OP_INOUT_CHECK(ctx->HasOutput("W_Out"), "Output", "W_Out", "hsigmoid"); } - const int64_t batch_size = ctx->GetInputDim("X")[0]; - std::vector output_shape({batch_size, 1}); + const int64_t input_dims = ctx->GetInputDim("X")[0]; + const int64_t label_dims = ctx->GetInputDim("Label")[0]; + PADDLE_ENFORCE_EQ(input_dims, label_dims, + platform::errors::InvalidArgument( + "The first dimension of " + "input and label is expected to be the same. 
" + "But received input's first dimension is %d; " + "label's first dimension is %d.", + input_dims, label_dims)); + + std::vector output_shape({input_dims, 1}); ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); ctx->ShareLoD("X", /*->*/ "Out"); } diff --git a/paddle/fluid/operators/index_select_op_npu.cc b/paddle/fluid/operators/index_select_op_npu.cc new file mode 100644 index 00000000000000..8df6c4e5d9ea72 --- /dev/null +++ b/paddle/fluid/operators/index_select_op_npu.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/index_select_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class IndexSelectNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *x = ctx.Input("X"); + auto *index = ctx.Input("Index"); + auto dim = ctx.Attr("dim"); + + auto *out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto stream = + ctx.template device_context() + .stream(); + + NpuOpRunner runner; + runner.SetType("GatherV2") + .AddInput(*x) + .AddInput(*index) + .AddInput(std::vector{dim}) + .AddOutput(*out); + runner.Run(stream); + } +}; + +// todo: add class 'IndexSelectGradNPUKernel' here. + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_NPU_KERNEL( + index_select, + ops::IndexSelectNPUKernel, + ops::IndexSelectNPUKernel, + ops::IndexSelectNPUKernel); +// todo: register npu index_select_grad kernel here. diff --git a/paddle/fluid/operators/kernel_primitives/compute_primitives.h b/paddle/fluid/operators/kernel_primitives/compute_primitives.h index 1d23cfe007558f..ccd301aa8ca3d4 100644 --- a/paddle/fluid/operators/kernel_primitives/compute_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/compute_primitives.h @@ -14,8 +14,140 @@ #pragma once +#ifdef PADDLE_WITH_CUDA +#include +#endif +#ifdef PADDLE_WITH_HIP +#include +#endif + +#include +#include "paddle/fluid/platform/float16.h" + namespace paddle { namespace operators { -namespace kernel_primitives {} +namespace kernel_primitives { +namespace details { + +template +class MPTypeTrait { + public: + using Type = T; +}; + +template <> +class MPTypeTrait { + public: + using Type = float; +}; + +} // namespace details + +/*************************** Compute Functor****************************/ +template +struct DivFunctor { + inline HOSTDEVICE T operator()(const T* args) const { + return args[0] / args[1]; + } +}; + +template +struct DivFunctor::value>> { + inline HOSTDEVICE T operator()(const T* args) const { + PADDLE_ENFORCE(args[1] != 0, + platform::errors::InvalidArgument( + "Invalid Argument Error: Integer division by zero " + "encountered in divide. 
Please check the input value.")); + return args[0] / args[1]; + } +}; + +/*************************** Compute Function****************************/ + +/** + * @brief compute functor for elementwise_two, in1 and in2 has the same shape + * @param: + * T : the type of in1 and in2 + * NX: the row of in1 and in2 + * NY: the col of in1 and in2 + * BlockSize: the strid of col + * OpFunc: compute functor eg: ADD, SUB, XOR, OR, MUL + */ +template +__device__ __forceinline__ void ElementwiseBinary(OutT* out, const T* in1, + const T* in2, + OpFunc compute) { + T args[2]; +#pragma unroll + for (int idx = 0; idx < NX * NY; ++idx) { + args[0] = in1[idx]; + args[1] = in2[idx]; + out[idx] = static_cast(compute(args)); + } +} + +/** + * @brief fma eg: a * b + c, in1 in2, in3 and out has the same shape + * @param: + * T : the type of in1 and in2, in3 + * NX: the row of in1, in2 and in3 + * NY: the col of in1, in2 and in3 + * BlockSize: the strid of col + */ +template +__device__ __forceinline__ void ElementwiseFma(OutT* out, const T* in1, + const T* in2, const T* in3, + OpFunc compute) { +#pragma unroll + for (int idx = 0; idx < NX * NY; ++idx) { + out[idx] = static_cast(compute(in1[idx], in2[idx], in3[idx])); + } } + +/** + * @brief compute functor for elementwise_two, in1 is [1, NY], in2 is [NX, NY] + * @param: + * T : the type of in1 and in2 + * NX: the row of in1 and in2 + * NY: the col of in2 + * BlockSize: the strid of col + * OpFunc: compute functor eg: ADD, SUB, XOR, OR, MUL + */ +template +__device__ __forceinline__ void CycleBinary(OutT* out, const T* in1, + const T* in2, OpFunc compute) { +#pragma unroll + for (int idx = 0; idx < NX; idx++) { +#pragma unroll + for (int idy = 0; idy < NY; idy++) { + out[idx + idy * NX] = + static_cast(compute(in1[idx], in2[idx + idy * NX])); + } + } } + +/** + * @brief compute functor for unary, in1 is [NX, NY] + * @param: + * T : the type of in + * NX: the row of in + * NY: the col of in + * BlockSize: the strid of col + * OpFunc: compute functor eg: relu, sigmoid, exp + */ +template +__device__ __forceinline__ void ElementwiseUnary(OutT* out, const T* in, + OpFunc compute) { +#pragma unroll + for (int idx = 0; idx < NX * NY; idx++) { + out[idx] = static_cast(compute(in + idx)); + } +} + +} // namespace kernel_primitives +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/kernel_primitives/datamover_primitives.h b/paddle/fluid/operators/kernel_primitives/datamover_primitives.h index 1d23cfe007558f..d520c33ca9bccf 100644 --- a/paddle/fluid/operators/kernel_primitives/datamover_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/datamover_primitives.h @@ -13,9 +13,205 @@ // limitations under the License. 
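kernel_primitives::ElementwiseBinary above walks an NX*NY register tile and hands both operands to the functor through a two-element args array, the same calling convention DivFunctor uses. A CPU-side analogue that can be compiled and checked without CUDA (hypothetical names, not the device code) is:

#include <cassert>

// Host-side analogue of kernel_primitives::ElementwiseBinary: walk an NX * NY
// tile and pass both operands to the functor through a two-element args array.
template <typename T, typename OutT, int NX, int NY, typename OpFunc>
void ElementwiseBinaryHost(OutT* out, const T* in1, const T* in2,
                           OpFunc compute) {
  T args[2];
  for (int idx = 0; idx < NX * NY; ++idx) {
    args[0] = in1[idx];
    args[1] = in2[idx];
    out[idx] = static_cast<OutT>(compute(args));
  }
}

struct AddFunctor {
  float operator()(const float* args) const { return args[0] + args[1]; }
};

int main() {
  float a[4] = {1, 2, 3, 4};
  float b[4] = {10, 20, 30, 40};
  float c[4] = {0, 0, 0, 0};
  ElementwiseBinaryHost<float, float, 2, 2>(c, a, b, AddFunctor());
  assert(c[0] == 11 && c[3] == 44);
  return 0;
}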
#pragma once +#include +#include +#include +#include +#include namespace paddle { namespace operators { -namespace kernel_primitives {} +namespace kernel_primitives { +namespace details { + +#define INT_BITS 32 + +template +struct alignas(sizeof(T) * VecSize) VectorType { + T val[VecSize]; +}; + +struct FastDivMod { + // 1st value represents the result of input number divides by recorded divisor + // 2nd value represents the result of input number modulo by recorded divisor + using DivModT = VectorType; + + FastDivMod() {} + HOSTDEVICE FastDivMod(uint32_t d) : divisor(d) { + static_assert(sizeof(unsigned int) == 4, + "Only Support 32-bit unsigned int."); + + for (shift_val = 0; shift_val < INT_BITS; ++shift_val) { + auto shift_limit = 1 << shift_val; + if (shift_limit >= divisor) break; + } + uint64_t long_one = 1; + uint64_t temp_div = + ((long_one << INT_BITS) * ((long_one << shift_val) - divisor)) / + divisor + + 1; + multiplier = temp_div; + } + + __device__ __forceinline__ uint32_t Div(uint32_t n) const { + uint32_t t = __umulhi(n, multiplier); + return (t + n) >> shift_val; + } + + __device__ __forceinline__ DivModT Divmod(uint32_t n) const { + uint32_t q = Div(n); + DivModT result = {q, n - q * divisor}; + return result; + } + + int32_t divisor; + int32_t shift_val; + uint32_t multiplier; +}; + +template +struct BroadcastConfig { + FastDivMod divmoders[kDims]; + uint32_t strides[framework::DDim::kMaxRank]; + HOSTDEVICE BroadcastConfig() {} + + HOSTDEVICE BroadcastConfig(const std::vector& out_dims, + const std::vector& in_dims, + int dim_size) { + std::vector strides_in; + std::vector divmoders_in; + // for divmoders + divmoders_in.resize(dim_size); + for (int i = 0; i < dim_size; ++i) { + divmoders_in[i] = FastDivMod(out_dims[i]); + } + // for strides + strides_in.resize(dim_size, 1); + for (int i = 0; i < dim_size; ++i) { + strides_in[i] = in_dims[i] == 1 ? 0 : strides_in[i]; + strides_in[i] = + (i != 0 && strides_in[i] != 0) + ? std::accumulate(in_dims.begin(), in_dims.begin() + i, 1, + std::multiplies()) + : strides_in[i]; + } + + memcpy(strides, strides_in.data(), kDims * sizeof(uint32_t)); + memcpy(divmoders, divmoders_in.data(), kDims * sizeof(FastDivMod)); + } +}; + +#undef INT_BITS +} // namespace details + +template +__device__ __forceinline__ void ReadDataBase(T* dst, const T* __restrict__ src, + int size) { + int dx = threadIdx.x * NX; +#pragma unroll + for (int idx = 0; idx < NX; ++idx) { + if ((idx + dx) >= size) { + break; + } + dst[idx] = src[idx + dx]; + } +} + +template +__device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src, + int size) { + const int VECTOR_SIZE = (NX % 4 == 0) ? 4 : (NX % 2 == 0) ? 
2 : 1; + const int VECTORS_PER_THREAD = NX / VECTOR_SIZE; + + // Vector per thread + if (blockDim.x * NX > size) { + ReadDataBase(dst, src, size); + } else { + // Vector type + using VecType = details::VectorType; + VecType vec_temp[VECTORS_PER_THREAD]; + const VecType* vec_input = reinterpret_cast(src); + ReadDataBase( + vec_temp, vec_input, VECTORS_PER_THREAD * blockDim.x); +#pragma unroll + for (int idx = 0; idx < NX; ++idx) { + dst[idx] = *(reinterpret_cast(vec_temp) + idx); + } + } +} + +/** @brief: ReadDataBc + * read data from src ptr when the shape of src and dst are different + * @param: + * src: the source pointer + * dst: the dst pointer + * stride_nx: the stride of src + * stride_ny: the stride of src + * the shape of dst is [NY, NX] + */ +template +__device__ __forceinline__ void ReadDataBc( + T* dst, const T* __restrict__ src, uint32_t fix, + details::BroadcastConfig config, int num, int stride_nx, + int stride_ny) { + uint32_t base_offset = fix + threadIdx.x * NX; + uint32_t offset = 0; + +#pragma unroll + for (int ny = 0; ny < NY; ++ny) { +#pragma unroll + for (uint32_t nx = 0; nx < NX; ++nx) { + uint32_t idx = base_offset + ny * stride_ny + nx * stride_nx; + if (idx < num) { + offset = 0; +#pragma unroll + for (int i = 0; i < ShapeSize; ++i) { + auto fast_divmoder = config.divmoders[i].Divmod(idx); + idx = fast_divmoder.val[0]; + offset += fast_divmoder.val[1] * config.strides[i]; + } + dst[nx + ny * NX] = src[offset]; + } + } + } +} + +template +__device__ __forceinline__ void WriteDataBase(T* dst, const T* __restrict__ src, + int size) { + int dx = threadIdx.x * NX; +#pragma unroll + for (int idx = 0; idx < NX; ++idx) { + if ((idx + dx) >= size) { + break; + } + dst[idx + dx] = src[idx]; + } } + +template +__device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src, + int size) { + const int VECTOR_SIZE = (NX % 4 == 0) ? 4 : (NX % 2 == 0) ? 2 : 1; + const int VECTORS_PER_THREAD = NX / VECTOR_SIZE; + + // Vector per thread + if (blockDim.x * NX > size) { + WriteDataBase(dst, src, size); + } else { + // Vector type + using VecType = details::VectorType; + VecType vec_temp[VECTORS_PER_THREAD]; +#pragma unroll + for (int idx = 0; idx < VECTORS_PER_THREAD; ++idx) { + vec_temp[idx] = *(reinterpret_cast(src) + idx); + } + VecType* vec_dst = reinterpret_cast(dst); + WriteDataBase( + vec_dst, vec_temp, VECTORS_PER_THREAD * blockDim.x); + } } + +} // namespace kernel_primitives +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/log_softmax_op.h b/paddle/fluid/operators/log_softmax_op.h index c732ec5a2da0ab..162087a75662d7 100644 --- a/paddle/fluid/operators/log_softmax_op.h +++ b/paddle/fluid/operators/log_softmax_op.h @@ -131,8 +131,10 @@ class LogSoftmaxKernel : public framework::OpKernel { // allocate memory on device. Out->mutable_data(context.GetPlace()); - LogSoftmaxFunctor()( - context.template device_context(), X, Out, axis); + if (X->numel() != 0) { + LogSoftmaxFunctor()( + context.template device_context(), X, Out, axis); + } } }; @@ -183,8 +185,11 @@ class LogSoftmaxGradKernel : public framework::OpKernel { // allocate memory on device. 
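To make the arithmetic behind the FastDivMod helper added above concrete, here is a small host-only sanity check that mirrors the constructor's multiplier/shift recipe; on the device the high 32 bits of the product come from __umulhi, emulated below with a 64-bit multiply. This is an illustration only, not part of the patch.

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t d = 24;              // divisor recorded at construction
      uint32_t shift = 0;
      while ((1u << shift) < d) ++shift;  // smallest shift with 2^shift >= d
      const uint64_t one = 1;
      const uint32_t multiplier = static_cast<uint32_t>(
          ((one << 32) * ((one << shift) - d)) / d + 1);
      for (uint32_t n = 0; n < 1000000; ++n) {
        uint32_t t = static_cast<uint32_t>((uint64_t{n} * multiplier) >> 32);
        uint32_t q = (t + n) >> shift;    // FastDivMod::Div
        assert(q == n / d);               // Divmod's remainder is then n - q * d
      }
      return 0;
    }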
dX->mutable_data(context.GetPlace()); - LogSoftmaxGradFunctor()( - context.template device_context(), Out, dOut, dX, axis); + if (Out->numel() != 0) { + LogSoftmaxGradFunctor()( + context.template device_context(), Out, dOut, dX, + axis); + } } }; diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc index 6c1ee863737011..83b4e89fe046f4 100644 --- a/paddle/fluid/operators/math/concat_and_split.cc +++ b/paddle/fluid/operators/math/concat_and_split.cc @@ -83,6 +83,12 @@ class SplitFunctor { const framework::Tensor& input, const std::vector& ref_inputs, const int axis, std::vector* outputs) { + // NOTE(zhiqiu): split a tensor of shape [0,3,4] at axis=1, result in 3 + // tensors of shape [0,1,4] + if (input.numel() == 0) { + return; + } + // TODO(zcd): Add input data validity checking size_t num = outputs->size(); diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index f9cce061383939..b9481f1c8e40e2 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -352,6 +352,12 @@ class SplitFunctor { const framework::Tensor& input, const std::vector& ref_inputs, int axis, std::vector* outputs) { + // NOTE(zhiqiu): split a tensor of shape [0,3,4] at axis=1, result in 3 + // tensors of shape [0,1,4] + if (input.numel() == 0) { + return; + } + // TODO(zcd): Add input data validity checking int o_num = outputs->size(); int64_t out_row = 1; diff --git a/paddle/fluid/operators/meshgrid_op.h b/paddle/fluid/operators/meshgrid_op.h old mode 100755 new mode 100644 index 2aad894e11d4b4..e01469f26d74fa --- a/paddle/fluid/operators/meshgrid_op.h +++ b/paddle/fluid/operators/meshgrid_op.h @@ -16,12 +16,6 @@ #include -#include -#include -#include -#include -#include - #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -29,31 +23,6 @@ #include "paddle/fluid/platform/errors.h" #define MAX_RANK_SUPPORTED 6 -// 1. BOOST_PP_REPEAT macro represents a fast horizontal repetition construct. -// Usage: BOOST_PP_REPEAT(count, macro, data). -// This macro expands to the sequence: -// macro(z, 0, data) macro(z, 1, data) ... macro(z, count - 1, data). -// 2. As for our case, count = MAX_RANK_SUPPORTED(which is 6). -// So the range of n is 0-5(which is count-1). -// We want to generate case 1-6 instead of case 0-5. -// So we need to change n to n + 1. 
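A note on the macro block removed just below: for MAX_RANK_SUPPORTED = 6, BOOST_PP_REPEAT instantiates n = 0..5, and COND(n) compares n with n % 6, which equals n in that range, so the condition is always true. The grad-side repetition therefore also degenerated to exactly six cases, and the plain switch statements that replace the macros are a faithful, dependency-free rewrite rather than a behavior change. Sketch of what the removed repetition expanded to:

    // BOOST_PP_REPEAT(6, MESHGRID_GRAD_TEMPLATE, ~), n = 0..5, case label n + 1:
    case 1: { MeshgridBackward<1>(context); break; }
    case 2: { MeshgridBackward<2>(context); break; }
    // ... up through ...
    case 6: { MeshgridBackward<6>(context); break; }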
-#define MESHGRID_TEMPLATE(z, n, data) \ - case n + 1: { \ - MeshgridForward(context); \ - break; \ - } -#define REP_MESHGRID_TEMPLATE(n) BOOST_PP_REPEAT(n, MESHGRID_TEMPLATE, ~) -#define COND(n) BOOST_PP_GREATER_EQUAL(n, BOOST_PP_MOD(n, MAX_RANK_SUPPORTED)) - -#define MESHGRID_GRAD_CASE(n) \ - case n + 1: { \ - MeshgridBackward(context); \ - break; \ - } -#define MESHGRID_GRAD_TEMPLATE(z, n, data) \ - BOOST_PP_IF(COND(n), MESHGRID_GRAD_CASE(n), ) -#define REP_MESHGRID_GRAD_TEMPLATE(n) \ - BOOST_PP_REPEAT(n, MESHGRID_GRAD_TEMPLATE, ~) namespace paddle { namespace operators { @@ -65,7 +34,24 @@ class MeshgridKernel : public framework::OpKernel { auto ins = context.MultiInput("X"); auto rank = ins.size(); switch (rank) { - REP_MESHGRID_TEMPLATE(MAX_RANK_SUPPORTED) + case 1: + MeshgridForward<1>(context); + break; + case 2: + MeshgridForward<2>(context); + break; + case 3: + MeshgridForward<3>(context); + break; + case 4: + MeshgridForward<4>(context); + break; + case 5: + MeshgridForward<5>(context); + break; + case 6: + MeshgridForward<6>(context); + break; default: PADDLE_THROW(platform::errors::InvalidArgument( "Excepted Tensor numbers between 1 and 6, but only received d% .", @@ -141,7 +127,24 @@ class MeshgridGradKernel : public framework::OpKernel { context.MultiInput(framework::GradVarName("Out")); int n = out_grad.size(); switch (n) { - REP_MESHGRID_GRAD_TEMPLATE(MAX_RANK_SUPPORTED) + case 1: + MeshgridBackward<1>(context); + break; + case 2: + MeshgridBackward<2>(context); + break; + case 3: + MeshgridBackward<3>(context); + break; + case 4: + MeshgridBackward<4>(context); + break; + case 5: + MeshgridBackward<5>(context); + break; + case 6: + MeshgridBackward<6>(context); + break; default: PADDLE_THROW(platform::errors::InvalidArgument( "Excepted Tensor numbers between 1 and 6, but only received d% .", diff --git a/paddle/fluid/operators/npu_op_runner.cc b/paddle/fluid/operators/npu_op_runner.cc index 4461941e85c2a5..bb6549c111988e 100644 --- a/paddle/fluid/operators/npu_op_runner.cc +++ b/paddle/fluid/operators/npu_op_runner.cc @@ -33,6 +33,7 @@ static std::map DTYPE_2_ACL_DTYPE = { {framework::proto::VarType::BOOL, ACL_BOOL}, {framework::proto::VarType::UINT8, ACL_UINT8}, + {framework::proto::VarType::INT8, ACL_INT8}, {framework::proto::VarType::INT16, ACL_INT16}, {framework::proto::VarType::INT32, ACL_INT32}, {framework::proto::VarType::INT64, ACL_INT64}, @@ -240,6 +241,38 @@ NpuOpRunner &NpuOpRunner::AddInput(std::vector &&dims) { return *this; } +NpuOpRunner &NpuOpRunner::AddInput(std::vector &&values) { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto *dev_ctx = + static_cast(pool.Get(platform::CPUPlace())); + Tensor host_tensor; + TensorFromVector(values, *dev_ctx, &host_tensor); + host_tensors_.emplace_back(host_tensor); + + // create aclTensorDesc + input_descs_.emplace_back(CreateTensorDesc(host_tensor, ACL_MEMTYPE_HOST)); + // create aclDataBuffer + input_buffers_.emplace_back(CreateDataBuffer(host_tensor)); + + return *this; +} + +NpuOpRunner &NpuOpRunner::AddInput(std::vector &&values) { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto *dev_ctx = + static_cast(pool.Get(platform::CPUPlace())); + Tensor host_tensor; + TensorFromVector(values, *dev_ctx, &host_tensor); + host_tensors_.emplace_back(host_tensor); + + // create aclTensorDesc + input_descs_.emplace_back(CreateTensorDesc(host_tensor, ACL_MEMTYPE_HOST)); + // create aclDataBuffer + 
input_buffers_.emplace_back(CreateDataBuffer(host_tensor)); + + return *this; +} + NpuOpRunner &NpuOpRunner::AddOutput(const Tensor &tensor) { // create aclTensorDesc output_descs_.emplace_back(CreateTensorDesc(tensor)); diff --git a/paddle/fluid/operators/npu_op_runner.h b/paddle/fluid/operators/npu_op_runner.h index 2257c209550d60..45e973970a956d 100644 --- a/paddle/fluid/operators/npu_op_runner.h +++ b/paddle/fluid/operators/npu_op_runner.h @@ -71,6 +71,10 @@ class NpuOpRunner { NpuOpRunner &AddInput(std::vector &&dims); + NpuOpRunner &AddInput(std::vector &&values); + + NpuOpRunner &AddInput(std::vector &&values); + NpuOpRunner &AddOutput(const Tensor &tensor); NpuOpRunner &AddInputs(const std::vector &tensors); diff --git a/paddle/fluid/operators/one_hot_op_npu.cc b/paddle/fluid/operators/one_hot_op_npu.cc new file mode 100644 index 00000000000000..1cf99d844c8887 --- /dev/null +++ b/paddle/fluid/operators/one_hot_op_npu.cc @@ -0,0 +1,82 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/one_hot_op.h" + +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +template +class OneHotNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = + ctx.template device_context(); + auto* in = ctx.Input("X"); + auto* out = ctx.Output("Out"); + int depth = ctx.Attr("depth"); + + if (ctx.HasInput("depth_tensor")) { + auto* depth_tensor = ctx.Input("depth_tensor"); + std::vector depth_data; + framework::TensorToVector(*depth_tensor, dev_ctx, &depth_data); + depth = depth_data[0]; + auto in_dims = in->dims(); + framework::DDim out_dims(in_dims); + out_dims[out_dims.size() - 1] = depth; + out->Resize(out_dims); + } + out->mutable_data(ctx.GetPlace()); + + float on_value = 1.0f, off_value = 0.0f; + if (in->type() == framework::proto::VarType::INT32) { + NpuOpRunner runner; + runner.SetType("OneHot") + .AddInput(*in) + .AddInput(std::vector({static_cast(depth)})) + .AddInput(std::vector({on_value})) + .AddInput(std::vector({off_value})) + .AddAttr("axis", -1) + .AddOutput(*out); + runner.Run(dev_ctx.stream()); + } else { + Tensor transformed_in; + transformed_in.mutable_data(in->dims(), dev_ctx.GetPlace()); + const auto& cast_runner = NpuOpRunner("Cast", {*in}, {transformed_in}, + {{"dst_type", ACL_INT32}}); + cast_runner.Run(dev_ctx.stream()); + NpuOpRunner runner; + runner.SetType("OneHot") + .AddInput(transformed_in) + .AddInput(std::vector({static_cast(depth)})) + .AddInput(std::vector({on_value})) + .AddInput(std::vector({off_value})) + .AddAttr("axis", -1) + .AddOutput(*out); + runner.Run(dev_ctx.stream()); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(one_hot, ops::OneHotNPUKernel, + ops::OneHotNPUKernel); diff --git 
a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu index 99a5caaad6ab80..674326f90c504d 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu @@ -15,7 +15,6 @@ #include "paddle/fluid/operators/reduce_ops/reduce_all_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" -// reduce_prod REGISTER_OP_CUDA_KERNEL( reduce_all, ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu index c7eafa2ac8760a..b7b0eb598249b1 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu @@ -16,7 +16,6 @@ #include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h" -// reduce_prod REGISTER_OP_CUDA_KERNEL( reduce_any, ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu index 50d2fcdee23bd9..b5d5bb33d0a880 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu @@ -13,58 +13,11 @@ // limitations under the License. #include -#include "paddle/fluid/operators/reduce_ops/cub_reduce.h" +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -namespace paddle { -namespace operators { - -template -struct DivideFunctor { - HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {} - - HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; } - - private: - T n_inv; -}; - -template -class ReduceMeanKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool reduce_all = context.Attr("reduce_all"); - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - - auto dims = context.Attr>("dim"); - bool keep_dim = context.Attr("keep_dim"); - - std::vector reduce_dims; - if (reduce_all) { - reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; - } else { - for (auto e : dims) { - reduce_dims.push_back(e >= 0 ? 
e : e + input->dims().size()); - } - } - - int reduce_num = 1; - for (int i = 0; i < reduce_dims.size(); ++i) { - reduce_num *= input->dims()[reduce_dims[i]]; - } - - auto stream = context.cuda_device_context().stream(); - TensorReduce>( - *input, output, reduce_dims, static_cast(0), cub::Sum(), - DivideFunctor(reduce_num), stream); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel, - ops::ReduceMeanKernel, - ops::ReduceMeanKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_mean, ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index fd329acaf5ff21..fe77d3158ed27c 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -33,6 +33,7 @@ namespace cub = hipcub; #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/fast_divmod.h" @@ -145,7 +146,6 @@ using Tensor = framework::Tensor; constexpr int kMaxRank = framework::DDim::kMaxRank; enum ReduceType { - kReduceAll = 0x00, // when reduce_rank == x_rank kReduceLastDim = 0x01, // when reduce_dim[0] == x_dim.size() - 1; kReduceHigherDim = 0x02, // ReduceFirstDim or reduceSecondDim kReduceAny = 0x03, // when reduce_dim.size() > 1 @@ -158,12 +158,13 @@ struct IndexCalculator { : dim(dim) { dims = detail::VectorToArray(cal_dims); strides = detail::VectorToArray(full_strides); - std::vector cal_divmoders; + std::vector cal_divmoders; // fast divmod for (auto i : cal_strides) { - cal_divmoders.push_back(FastDivMod(i)); + cal_divmoders.push_back(platform::FastDivMod(i)); } - divmoders = detail::VectorToArray(cal_divmoders); + divmoders = + detail::VectorToArray(cal_divmoders); } __device__ inline int Get(int offset) const { @@ -183,7 +184,7 @@ struct IndexCalculator { int dim; framework::Array dims; framework::Array strides; - framework::Array divmoders; + framework::Array divmoders; }; // reduce config @@ -338,15 +339,11 @@ struct ReduceConfig { void SetReduceType() { int rank = x_dim.size(); int reduce_rank = reduce_dim.size(); - bool is_large_enough = (reduce_num > REDUCE_SPLIT_BOUNDARY / 2) || - (left_num > REDUCE_SPLIT_BOUNDARY); - - if (rank == reduce_rank) { - reduce_type = static_cast(ReduceType::kReduceAll); - } else if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) { + bool is_last_dim = + (rank == 2) && (reduce_rank == 1) && (reduce_dim[0] == 1); + if (rank == reduce_rank || is_last_dim) { reduce_type = static_cast(ReduceType::kReduceLastDim); - } else if (reduce_rank == 1 && - ((rank == 2 && is_large_enough) || rank != 2)) { + } else if (reduce_rank == 1) { // ReduceFirstDim and reduceSecondDim reduce_type = static_cast(ReduceType::kReduceHigherDim); } else { @@ -576,14 +573,15 @@ static __device__ T BlockYReduce(T val, ReduceOp reducer) { // eg: x_dim = {nz, ny, nx}, nx != 1, axis can be 0 or 1 // if axis = 1 then grid.z = nz, grid.y = ny / block_size, grid.x = nx / 32 // else grid.z = 1, grid.y = ny / block_size, grid.x = nx /32 -template +template __device__ void ReduceHigherDim(const Tx* x, Ty* y, ReduceOp reducer, - TransformOp transformer, Ty init, + TransformOp transformer, MPType init, int reduce_num, int left_num, int block_size) { int idx 
= blockIdx.x * blockDim.x + threadIdx.x; int idy = blockIdx.y * block_size; - Ty reduce_var = init; + MPType reduce_var = init; if (idx < left_num) { int loop = reduce_num - idy; @@ -591,24 +589,24 @@ __device__ void ReduceHigherDim(const Tx* x, Ty* y, ReduceOp reducer, for (int iy = 0; iy < loop; iy++) { int id = (idy + iy) * left_num + idx + blockIdx.z * reduce_num * left_num; - reduce_var = reducer(reduce_var, static_cast(transformer(x[id]))); + reduce_var = reducer(reduce_var, static_cast(transformer(x[id]))); } y[idx + blockIdx.y * left_num + blockIdx.z * gridDim.y * left_num] = - reduce_var; + static_cast(reduce_var); } } // when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or // when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this // function will be used -template +template __device__ void ReduceAny(const Tx* x, Ty* y, ReduceOp reducer, - TransformOp transformer, Ty init, int reduce_num, + TransformOp transformer, MPType init, int reduce_num, int left_num, bool reduce_lastdim, - ReduceIndexCal reduce_index_calculator, - LeftIndexCal left_index_calculator) { + const IndexCalculator& reduce_index_calculator, + const IndexCalculator& left_index_calculator) { int input_idx, left_idx, stride; // the last dim gets involved in reduction if (reduce_lastdim) { @@ -621,9 +619,9 @@ __device__ void ReduceAny(const Tx* x, Ty* y, ReduceOp reducer, stride = gridDim.y * blockDim.y; } // calculate the offset, means the addr where each thread really start. - int input_offset = left_index_calculator(left_idx); + int input_offset = left_index_calculator.Get(left_idx); const Tx* input = x + input_offset; - Ty reduce_var = init; + MPType reduce_var = init; // 1. reduce for each thread if (left_idx < left_num) { @@ -634,12 +632,13 @@ __device__ void ReduceAny(const Tx* x, Ty* y, ReduceOp reducer, #pragma unroll for (int i = 0; i < REDUCE_VEC_SIZE; ++i) { int reduce_idx = input_idx + i * stride; - int idx_x = reduce_index_calculator(reduce_idx); + int idx_x = reduce_index_calculator.Get(reduce_idx); input_reg[i] = input[idx_x]; } #pragma unroll for (int i = 0; i < REDUCE_VEC_SIZE; ++i) { - reduce_var = reducer(reduce_var, transformer(input_reg[i])); + reduce_var = + reducer(reduce_var, static_cast(transformer(input_reg[i]))); } input_idx += REDUCE_VEC_SIZE * stride; } @@ -652,7 +651,7 @@ __device__ void ReduceAny(const Tx* x, Ty* y, ReduceOp reducer, break; } int reduce_idx = input_idx; - int idx_x = reduce_index_calculator(reduce_idx); + int idx_x = reduce_index_calculator.Get(reduce_idx); input_reg[i] = input[idx_x]; input_idx += stride; } @@ -662,7 +661,8 @@ __device__ void ReduceAny(const Tx* x, Ty* y, ReduceOp reducer, if (input_idx >= reduce_num) { break; } - reduce_var = reducer(reduce_var, transformer(input_reg[i])); + reduce_var = + reducer(reduce_var, static_cast(transformer(input_reg[i]))); input_idx += stride; } } @@ -677,63 +677,56 @@ __device__ void ReduceAny(const Tx* x, Ty* y, ReduceOp reducer, // 3. 
reduce in block x reduce_var = BlockXReduce(reduce_var, reducer); if (left_idx < left_num && threadIdx.x == 0) { - y[blockIdx.y * left_num + left_idx] = reduce_var; + y[blockIdx.y * left_num + left_idx] = static_cast(reduce_var); } } else { if (left_idx < left_num && threadIdx.y == 0) { - y[blockIdx.y * left_num + left_idx] = reduce_var; + y[blockIdx.y * left_num + left_idx] = static_cast(reduce_var); } } } // module function designed for global function -template +template __device__ void ReduceModule(const Tx* x, Ty* y, ReduceOp reducer, - TransformOp transformer, Ty init, int reduce_num, - int left_num, int blocking_size, int reduce_type, - bool reduce_lastdim, + TransformOp transformer, MPType init, + int reduce_num, int left_num, int blocking_size, + int reduce_type, bool reduce_lastdim, const IndexCalculator& reduce_index_calculator, const IndexCalculator& left_index_calculator) { - if (reduce_type == ReduceType::kReduceLastDim) { - ReduceAny( + if (reduce_type == ReduceType::kReduceLastDim || + reduce_type == ReduceType::kReduceAny) { + ReduceAny( x, y, reducer, transformer, init, reduce_num, left_num, reduce_lastdim, - [&](int idx) { return idx; }, - [&](int idx) { return idx * reduce_num; }); - + reduce_index_calculator, left_index_calculator); // reduce_rank == 1 && reduce_dim[0] != x_dim.size() - 1 } else if (reduce_type == ReduceType::kReduceHigherDim) { - ReduceHigherDim( + ReduceHigherDim( x, y, reducer, transformer, init, reduce_num, left_num, blocking_size); - - // reduce_rank >= 2 - } else { - ReduceAny( - x, y, reducer, transformer, init, reduce_num, left_num, reduce_lastdim, - [&](int idx) { return reduce_index_calculator.Get(idx); }, - [&](int idx) { return left_index_calculator.Get(idx); }); } } -template +template __global__ void ReduceKernelFunction(const Tx* x, Ty* y, ReduceOp reducer, - TransformOp transformer, Ty init, + TransformOp transformer, MPType init, int reduce_num, int left_num, int blocking_size, int reduce_type, bool reduce_lastdim, IndexCalculator reduce_index_calculator, IndexCalculator left_index_calculator) { - ReduceModule( + ReduceModule( x, y, reducer, transformer, init, reduce_num, left_num, blocking_size, reduce_type, reduce_lastdim, reduce_index_calculator, left_index_calculator); } -template +template static void LaunchReduceKernel(const Tx* x_data, Ty* y_data, - const ReduceOp& reducer, Ty init, + const ReduceOp& reducer, MPType init, gpuStream_t stream, ReduceConfig config) { using TransformOp = typename ReduceOp::Transformer; - int reduce_rank = config.reduce_strides.size(); int left_rank = config.left_strides.size(); auto reduce_index_calculator = IndexCalculator( @@ -741,7 +734,7 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data, auto left_index_calculator = IndexCalculator( left_rank, config.left_dim, config.left_strides, config.x_strides); - ReduceKernelFunction<<>>( x_data, config.output_data, reducer, TransformOp(config.reduce_num), init, config.reduce_num, config.left_num, config.blocking_size, @@ -759,10 +752,11 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data, grid = dim3(config.grid.x, 1, config.grid.z); } - ReduceKernelFunction><<>>( + ReduceKernelFunction< + Ty, Ty, MPType, ReduceOp, + detail::IdentityFunctor><<>>( config.output_data, y_data, reducer, - detail::IdentityFunctor(config.grid.y), init, config.grid.y, + detail::IdentityFunctor(config.grid.y), init, config.grid.y, config.left_num, config.grid.y, ReduceType::kReduceHigherDim, config.reduce_lastdim, reduce_index_calculator, 
left_index_calculator); } @@ -793,11 +787,12 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y, } config.SetOutputData(y_data, x.place(), &tmp); - - using TransformOp = typename ReduceOp::Transformer; - auto reducer = ReduceOp(); - // launch CUB::Reduce - if (config.reduce_type == static_cast(ReduceType::kReduceAll)) { + bool use_cub_reduce = (config.left_num == 1) && + (!std::is_same::value); + if (use_cub_reduce) { + // launch CUB::Reduce + using TransformOp = typename ReduceOp::Transformer; + auto reducer = ReduceOp(); cub::TransformInputIterator trans_x( x_data, TransformOp(config.reduce_num)); size_t temp_storage_bytes = 0; @@ -815,7 +810,9 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y, return; } - LaunchReduceKernel>( + using MPType = typename details::MPTypeTrait::Type; + auto reducer = ReduceOp(); + LaunchReduceKernel>( x_data, y_data, reducer, reducer.initial(), stream, config); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index 1c36cebe70a77e..af01b71adb78e3 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -591,7 +591,6 @@ class ReduceGradOp : public framework::OperatorWithKernel { (in_dtype >= 0) ? static_cast(in_dtype) : OperatorWithKernel::IndicateVarDataType( ctx, framework::GradVarName("Out")); - #ifdef PADDLE_WITH_MKLDNN auto CanMKLDNNReduceGradBeUsed = [&]() { auto dx_dims = ctx.Input("X")->dims(); diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc new file mode 100644 index 00000000000000..834b63f199e37d --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc @@ -0,0 +1,101 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the Licnse. 
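One detail of the reduce_op.cu.h changes above worth spelling out: the accumulator is no longer the output type Ty but the MPType obtained from MPTypeTrait (via the newly included amp/fp16_type_traits.h), so float16 inputs are accumulated in float and only the final value is cast back to Ty. A minimal sketch of the mapping, assuming the trait is specialized for platform::float16 in the same way as the MPTypeTrait added to compute_primitives.h:

    // Sketch only; assumes <type_traits> plus the fp16_type_traits header.
    using FP16Acc = typename details::MPTypeTrait<platform::float16>::Type;
    static_assert(std::is_same<FP16Acc, float>::value,
                  "float16 reductions accumulate in float");
    using F64Acc = typename details::MPTypeTrait<double>::Type;
    static_assert(std::is_same<F64Acc, double>::value,
                  "all other types accumulate in themselves");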
*/ + +#include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +class ReduceProdNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + auto dims = ctx.Attr>("dim"); + bool keep_dim = ctx.Attr("keep_dim"); + bool reduce_all = ctx.Attr("reduce_all"); + int out_dtype = ctx.Attr("out_dtype"); + + auto place = ctx.GetPlace(); + + framework::Tensor cast_out(x->type()); + cast_out.Resize(out->dims()); + cast_out.mutable_data(place); + + auto cast_out_dtype = x->type(); + if (out_dtype != -1) { + cast_out_dtype = static_cast(out_dtype); + } + + if (x->type() != cast_out_dtype) { + if (cast_out_dtype == framework::proto::VarType::FP32) { + out->mutable_data(place); + } else if (cast_out_dtype == framework::proto::VarType::FP16) { + out->mutable_data(place); + } else if (cast_out_dtype == framework::proto::VarType::INT16) { + out->mutable_data(place); + } else if (cast_out_dtype == framework::proto::VarType::INT32) { + out->mutable_data(place); + } else if (cast_out_dtype == framework::proto::VarType::INT64) { + out->mutable_data(place); + } else if (cast_out_dtype == framework::proto::VarType::FP64) { + out->mutable_data(place); + } else if (cast_out_dtype == framework::proto::VarType::BOOL) { + out->mutable_data(place); + } + } else { + out->ShareDataWith(cast_out); + } + + framework::NPUAttributeMap attr_input = {{"axes", dims}, + {"keep_dims", keep_dim}}; + + if (reduce_all) { + std::vector dim_vec; + for (int i = 0; i < x->dims().size(); i++) { + dim_vec.push_back(i); + } + + attr_input = {{"axes", dim_vec}, {"keep_dims", keep_dim}}; + } + + auto stream = + ctx.template device_context() + .stream(); + + const auto& runner = + NpuOpRunner("ReduceProdD", {*x}, {cast_out}, attr_input); + runner.Run(stream); + + if (x->type() != cast_out_dtype) { + auto dst_dtype = ConvertToNpuDtype(cast_out_dtype); + const auto& runner_cast = + NpuOpRunner("Cast", {cast_out}, {*out}, + {{"dst_type", static_cast(dst_dtype)}}); + runner_cast.Run(stream); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_NPU_KERNEL( + reduce_prod, ops::ReduceProdNPUKernel, + ops::ReduceProdNPUKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu index efbafe4aa8c3e0..27a29a5b095056 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu @@ -11,72 +11,18 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
- -#include "paddle/fluid/operators/reduce_ops/cub_reduce.h" +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h" - -namespace paddle { -namespace operators { - -template -struct IdentityFunctor { - HOSTDEVICE explicit inline IdentityFunctor() {} - - template - HOSTDEVICE inline Tout operator()(const U& x) const { - return static_cast(x); - } -}; - -template -class ReduceSumKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool reduce_all = context.Attr("reduce_all"); - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - auto out_dtype = context.Attr("out_dtype"); - - auto dims = context.Attr>("dim"); - bool keep_dim = context.Attr("keep_dim"); - - std::vector reduce_dims; - if (reduce_all) { - reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; - } else { - for (auto e : dims) { - reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); - } - } - - int reduce_num = 1; - for (int i = 0; i < reduce_dims.size(); ++i) { - reduce_num *= input->dims()[reduce_dims[i]]; - } - - auto stream = context.cuda_device_context().stream(); - if (out_dtype >= 0) { - framework::VisitDataTypeSmall( - static_cast(out_dtype), - TensorReduceFunctor( - *input, output, reduce_dims, static_cast(0.0), cub::Sum(), - stream)); - } else { - TensorReduce>( - *input, output, reduce_dims, static_cast(0), cub::Sum(), - IdentityFunctor(), stream); - } - } -}; - -} // namespace operators -} // namespace paddle - REGISTER_OP_CUDA_KERNEL( - reduce_sum, ops::ReduceSumKernel, ops::ReduceSumKernel, - ops::ReduceSumKernel, - ops::ReduceSumKernel, ops::ReduceSumKernel, - ops::ReduceSumKernel, - ops::ReduceSumKernel>, - ops::ReduceSumKernel>); + reduce_sum, ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + paddle::operators::CustomSum>, + ops::ReduceCudaKernel, + paddle::operators::CustomSum>); diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc b/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc new file mode 100644 index 00000000000000..aa84da10ad6531 --- /dev/null +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc @@ -0,0 +1,138 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/sequence_ops/sequence_mask_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class SequenceMaskNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + auto* x = ctx.Input("X"); + auto* y = ctx.Output("Y"); + int maxlen = ctx.Attr("maxlen"); + + if (ctx.HasInput("MaxLenTensor")) { + auto max_len_tensor = ctx.Input("MaxLenTensor"); + PADDLE_ENFORCE_NOT_NULL(max_len_tensor, + platform::errors::InvalidArgument( + "Input(MaxLenTensor) should not be NULL." + "But received Input(MaxLenTensor) is NULL")); + framework::Tensor temp; + TensorCopySync(*max_len_tensor, platform::CPUPlace(), &temp); + maxlen = *temp.data(); + PADDLE_ENFORCE_GT( + maxlen, 0, + platform::errors::InvalidArgument( + "Input(MaxLenTensor) value should be greater than 0. But " + "received Input(MaxLenTensor) value = %d.", + maxlen)); + } + + if (maxlen < 0) { + auto x_numel = x->numel(); + std::vector x_vec; + framework::TensorToVector(*x, dev_ctx, &x_vec); + auto x_data = x_vec.data(); + maxlen = static_cast(*std::max_element(x_data, x_data + x_numel)); + } + auto y_dim = framework::vectorize(x->dims()); + y_dim.push_back(maxlen); + + Tensor cast_x; + cast_x.mutable_data(x->dims(), ctx.GetPlace()); + const auto& cast1_runner = + NpuOpRunner("Cast", {*x}, {cast_x}, + {{"dst_type", ConvertToNpuDtype(cast_x.type())}}); + cast1_runner.Run(dev_ctx.stream()); + + Tensor tmp; + tmp.mutable_data(framework::make_ddim({maxlen}), ctx.GetPlace()); + NpuOpRunner range_runner; + range_runner.SetType("Range"); + range_runner.AddInput(std::vector({0})); + range_runner.AddInput(std::vector({maxlen})); + range_runner.AddInput(std::vector({1})); + range_runner.AddOutput(tmp); + range_runner.Run(dev_ctx.stream()); + + Tensor expand_tmp; + expand_tmp.mutable_data(framework::make_ddim(y_dim), + ctx.GetPlace()); + const auto& expand_runner = + NpuOpRunner("ExpandD", {tmp}, {expand_tmp}, {{"shape", y_dim}}); + expand_runner.Run(dev_ctx.stream()); + + auto x_dims = framework::vectorize(x->dims()); + x_dims.push_back(1); + cast_x.Resize(framework::make_ddim({x_dims})); + Tensor x_tmp; + x_tmp.mutable_data(framework::make_ddim(y_dim), ctx.GetPlace()); + const auto& tile_runner = + NpuOpRunner("TileWithAxis", {cast_x}, {x_tmp}, + {{"axis", x->dims().size()}, {"tiles", maxlen}}); + tile_runner.Run(dev_ctx.stream()); + + Tensor y_tmp; + y_tmp.mutable_data(framework::make_ddim(y_dim), ctx.GetPlace()); + const auto& less_runner = + NpuOpRunner("Less", {expand_tmp, x_tmp}, {y_tmp}, {}); + less_runner.Run(dev_ctx.stream()); + + y->Resize(framework::make_ddim(y_dim)); + auto out_dtype = static_cast( + ctx.Attr("out_dtype")); + if (out_dtype == framework::proto::VarType::INT32) { + y->mutable_data(ctx.GetPlace()); + } else if (out_dtype == framework::proto::VarType::INT64) { + y->mutable_data(ctx.GetPlace()); + } else if (out_dtype == framework::proto::VarType::FP32) { + y->mutable_data(ctx.GetPlace()); + } else if (out_dtype == framework::proto::VarType::FP64) { + y->mutable_data(ctx.GetPlace()); + } else if (out_dtype == framework::proto::VarType::BOOL) { + y->mutable_data(ctx.GetPlace()); + } else if (out_dtype == framework::proto::VarType::UINT8) { + y->mutable_data(ctx.GetPlace()); + } else { + PADDLE_ENFORCE(false, + platform::errors::InvalidArgument( + "out_dtype only supporing int32, int64, fp32, 
fp64, " + "bool, uint8, but receive out_dtype is %d", + out_dtype)); + } + + const auto& cast2_runner = NpuOpRunner( + "Cast", {y_tmp}, {*y}, {{"dst_type", ConvertToNpuDtype(out_dtype)}}); + cast2_runner.Run(dev_ctx.stream()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + sequence_mask, ops::SequenceMaskNPUKernel, + ops::SequenceMaskNPUKernel, + ops::SequenceMaskNPUKernel, + ops::SequenceMaskNPUKernel); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h index 74316841a13b17..29528ae0d29925 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h @@ -111,15 +111,12 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { const Tensor* labels = context.Input("Label"); Tensor* logit_grad = context.Output(framework::GradVarName("Logits")); - const Tensor* softmax = context.Input("Softmax"); const bool use_softmax = context.Attr("use_softmax"); - if (logit_grad != softmax || !use_softmax) { framework::TensorCopy(*softmax, context.GetPlace(), context.device_context(), logit_grad); } - const bool soft_label = context.Attr("soft_label"); auto ignore_index = context.Attr("ignore_index"); @@ -133,7 +130,6 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { logit_grad_2d.ShareDataWith(*logit_grad).Resize({n, d}); labels_2d.ShareDataWith(*labels).Resize({n, labels->numel() / n}); out_grad_2d.ShareDataWith(*out_grad).Resize({n, d / axis_dim}); - auto out_grad_mat = framework::EigenMatrix::From(out_grad_2d); auto logit_grad_mat = framework::EigenMatrix::From(logit_grad_2d); auto& place = *context.template device_context() @@ -147,9 +143,8 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { logit_grad_mat.device(place) = out_grad_mat.broadcast(Eigen::DSizes(1, axis_dim)) * logit_grad_mat; - } - // use_softmax step2 - else { + } else { + // use_softmax step2 const int64_t* label_data = labels->data(); T* logit_grad_data = logit_grad->data(); const T* out_grad_data = out_grad->data(); @@ -180,7 +175,6 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { } return; } - // for use_softmax=False, continue if (soft_label) { diff --git a/paddle/fluid/operators/squared_l2_norm_op_npu.cc b/paddle/fluid/operators/squared_l2_norm_op_npu.cc new file mode 100644 index 00000000000000..fb4d8fefda7a7f --- /dev/null +++ b/paddle/fluid/operators/squared_l2_norm_op_npu.cc @@ -0,0 +1,99 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
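For the SequenceMask NPU kernel above, the Range -> ExpandD -> TileWithAxis -> Less chain reproduces the usual sequence_mask semantics y[i][j] = (j < x[i]) before the final Cast to out_dtype. A small worked example of what the kernel should produce for x = [2, 4] with maxlen = 4:

    j          :  0  1  2  3
    x[0] = 2   :  1  1  0  0
    x[1] = 4   :  1  1  1  1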
*/ + +#include "paddle/fluid/operators/squared_l2_norm_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class SquaredL2NormNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *x = context.Input("X"); + auto *out = context.Output("Out"); + + auto place = context.GetPlace(); + auto stream = + context.template device_context() + .stream(); + + std::vector axis; + for (int i = 0; i < x->dims().size(); ++i) { + axis.push_back(i); + } + out->mutable_data(place); + const auto &runner = NpuOpRunner("SquareSumV1", {*x}, {*out}, + {{"axis", axis}, {"keep_dims", false}}); + runner.Run(stream); + } +}; + +template +class SquaredL2NormGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *x = context.Input("X"); + auto *x_grad = context.Output(framework::GradVarName("X")); + auto *out_grad = context.Input(framework::GradVarName("Out")); + + PADDLE_ENFORCE_EQ( + out_grad->numel(), 1, + platform::errors::InvalidArgument( + "Input(GRAD@Out) of SquaredL2NormGradOP should be a scalar.")); + + auto place = context.GetPlace(); + auto stream = + context.template device_context() + .stream(); + + // broadcast out_grad + Tensor broadcasted_out_grad; + broadcasted_out_grad.mutable_data(x_grad->dims(), place); + const auto &broadcast_runner = + NpuOpRunner("BroadcastToD", {*out_grad}, {broadcasted_out_grad}, + {{"shape", framework::vectorize(x_grad->dims())}}); + broadcast_runner.Run(stream); + // mul x + Tensor tmp_x_grad; + tmp_x_grad.mutable_data(x_grad->dims(), place); + const auto &mul_x_runner = + NpuOpRunner("Mul", {broadcasted_out_grad, *x}, {tmp_x_grad}, {}); + mul_x_runner.Run(stream); + // mul coefficient:2 + Tensor coefficient; + coefficient.mutable_data({1}, place); + FillNpuTensorWithConstant(&coefficient, static_cast(2.0)); + x_grad->mutable_data(place); + const auto &mul_coefficient_runner = + NpuOpRunner("Mul", {tmp_x_grad, coefficient}, {*x_grad}, {}); + mul_coefficient_runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + squared_l2_norm, + ops::SquaredL2NormNPUKernel); +REGISTER_OP_NPU_KERNEL( + squared_l2_norm_grad, + ops::SquaredL2NormGradNPUKernel); diff --git a/paddle/fluid/operators/tile_op.h b/paddle/fluid/operators/tile_op.h old mode 100755 new mode 100644 index 1fb0fa6ce5176f..260cbc23687313 --- a/paddle/fluid/operators/tile_op.h +++ b/paddle/fluid/operators/tile_op.h @@ -17,40 +17,12 @@ limitations under the License. */ #include #include -#include -#include -#include -#include -#include -#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/eigen/eigen_function.h" #define MAX_RANK_SUPPORTED 6 -// 1. BOOST_PP_REPEAT macro represents a fast horizontal repetition construct. -// Usage: BOOST_PP_REPEAT(count, macro, data). -// This macro expands to the sequence: -// macro(z, 0, data) macro(z, 1, data) ... macro(z, count - 1, data). -// 2. As for our case, count = MAX_RANK_SUPPORTED(which is 6). -// So the range of n is 0-5(which is count-1). -// We want to generate case 1-6 instead of case 0-5. -// So we need to change n to n + 1. 
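The SquaredL2NormGrad NPU kernel above is easier to follow with the derivation written out: Out = sum_i x_i^2, so dOut/dx_i = 2 * x_i and dX = 2 * x * dOut, where the scalar dOut is first broadcast to x's shape. That is exactly what the BroadcastToD, Mul, and multiply-by-2 runner calls compute, for example:

    // x = [3, -1]            -> Out = 3*3 + (-1)*(-1) = 10
    // dOut = [1] (scalar)    -> broadcast to [1, 1]
    // tmp  = dOut * x        =  [3, -1]
    // dX   = tmp * 2         =  [6, -2]   ==  2 * x * dOut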
-#define TILE_TEMPLATE(z, n, data) \ - case n + 1: { \ - Tile(context); \ - break; \ - } -#define REP_TILE_TEMPLATE(n) BOOST_PP_REPEAT(n, TILE_TEMPLATE, ~) -#define COND(n) BOOST_PP_GREATER_EQUAL(n, BOOST_PP_MOD(n, MAX_RANK_SUPPORTED)) -#define TILE_GRAD_CASE(n) \ - case n + 1: { \ - TileBackward(context, reshape_dims_vec, reduce_dims_vec); \ - break; \ - } -#define TILE_GRAD_TEMPLATE(z, n, data) BOOST_PP_IF(COND(n), TILE_GRAD_CASE(n), ) -#define REP_TILE_GRAD_TEMPLATE(n) BOOST_PP_REPEAT(n, TILE_GRAD_TEMPLATE, ~) namespace paddle { namespace operators { @@ -60,7 +32,8 @@ inline std::vector get_repeat_times( auto* repeat_tensor = ctx.Input("RepeatTimes"); auto* repeat_data = repeat_tensor->data(); framework::Tensor cpu_repeat_tensor; - if (platform::is_gpu_place(repeat_tensor->place())) { + if (platform::is_gpu_place(repeat_tensor->place()) || + platform::is_npu_place(repeat_tensor->place())) { TensorCopySync(*repeat_tensor, platform::CPUPlace(), &cpu_repeat_tensor); repeat_data = cpu_repeat_tensor.data(); } @@ -76,7 +49,8 @@ inline std::vector get_repeat_times( std::vector vec_repeat_times; for (size_t i = 0; i < list_repeat_times_tensor.size(); ++i) { auto tensor = list_repeat_times_tensor[i]; - if (platform::is_gpu_place(tensor->place())) { + if (platform::is_gpu_place(tensor->place()) || + platform::is_npu_place(tensor->place())) { framework::Tensor temp; TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_repeat_times.push_back(*temp.data()); @@ -130,7 +104,26 @@ class TileKernel : public framework::OpKernel { "must be less than or equal to %d, but the value received is %d.", MAX_RANK_SUPPORTED, repeat_times_size)); rank = std::max(rank, repeat_times_size); - switch (rank) { REP_TILE_TEMPLATE(MAX_RANK_SUPPORTED) } + switch (rank) { + case 1: + Tile<1>(context); + break; + case 2: + Tile<2>(context); + break; + case 3: + Tile<3>(context); + break; + case 4: + Tile<4>(context); + break; + case 5: + Tile<5>(context); + break; + case 6: + Tile<6>(context); + break; + } } protected: @@ -251,7 +244,24 @@ class TileGradKernel : public framework::OpKernel { "to %d, but the value received is %d.", MAX_RANK_SUPPORTED, dims)); switch (dims) { - REP_TILE_GRAD_TEMPLATE(MAX_RANK_SUPPORTED) + case 1: + TileBackward<1>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 2: + TileBackward<2>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 3: + TileBackward<3>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 4: + TileBackward<4>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 5: + TileBackward<5>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 6: + TileBackward<6>(context, reshape_dims_vec, reduce_dims_vec); + break; default: PADDLE_THROW(platform::errors::InvalidArgument( "Only support tensor with rank being between 1 and 6. But " diff --git a/paddle/fluid/operators/tile_op_npu.cc b/paddle/fluid/operators/tile_op_npu.cc new file mode 100644 index 00000000000000..c85a1cbc671af1 --- /dev/null +++ b/paddle/fluid/operators/tile_op_npu.cc @@ -0,0 +1,118 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/tile_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { +template +class TileNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto rank = context.Input("X")->dims().size(); + PADDLE_ENFORCE_GE( + rank, 1, platform::errors::InvalidArgument( + "The rank of the input 'x' for tile op must be a positive " + "integer, but the value received is %d.", + rank)); + PADDLE_ENFORCE_LE( + rank, MAX_RANK_SUPPORTED, + platform::errors::InvalidArgument( + "The rank of the input 'x' for tile op " + "must be less than or equal to %d, but the value received is %d.", + MAX_RANK_SUPPORTED, rank)); + auto repeat_times = get_repeat_times(context); + int repeat_times_size = repeat_times.size(); + PADDLE_ENFORCE_GE( + repeat_times_size, 1, + platform::errors::InvalidArgument( + "The number of elements of the input 'repeat_times' for tile " + "op must be positive, but the value received is %d.", + repeat_times_size)); + PADDLE_ENFORCE_LE( + repeat_times_size, MAX_RANK_SUPPORTED, + platform::errors::InvalidArgument( + "The number of elements of the input 'repeat_times' for tile op " + "must be less than or equal to %d, but the value received is %d.", + MAX_RANK_SUPPORTED, repeat_times_size)); + rank = std::max(rank, repeat_times_size); + Tile(context); + } + + protected: + void Tile(const framework::ExecutionContext& context) const { + auto* in0 = context.Input("X"); + + auto in_dims = in0->dims(); + auto repeat_times = get_repeat_times(context); + for (size_t i = 0; i < repeat_times.size(); ++i) { + PADDLE_ENFORCE_GT( + repeat_times[i], 0, + platform::errors::InvalidArgument( + "All elements of the input 'repeat_times' for tile op must " + "be positive integers, but the value received is %d.", + repeat_times[i])); + } + auto vec_in_dims = framework::vectorize(in_dims); + if (repeat_times.size() < vec_in_dims.size()) { + int diff = vec_in_dims.size() - repeat_times.size(); + repeat_times.insert(repeat_times.begin(), diff, 1); + } else { + int diff = repeat_times.size() - vec_in_dims.size(); + vec_in_dims.insert(vec_in_dims.begin(), diff, 1); + } + PADDLE_ENFORCE_EQ( + repeat_times.size(), vec_in_dims.size(), + platform::errors::InvalidArgument( + "The rank (%d) of the input 'x' and the rank (%d) of the input " + "'repeat_times' for tile op must match after promotion.", + vec_in_dims.size(), repeat_times.size())); + auto* out0 = context.Output("Out"); + + framework::DDim new_in_dims = framework::make_ddim(vec_in_dims); + framework::DDim out_dims(new_in_dims); + + for (size_t i = 0; i < repeat_times.size(); ++i) { + out_dims[i] *= repeat_times[i]; + } + + out0->Resize(out_dims); + out0->mutable_data(context.GetPlace()); + + std::vector temp(repeat_times.size(), 1); + if (repeat_times == temp) { + framework::TensorCopy( + *in0, context.GetPlace(), + context.template device_context(), out0); + return; + } + + const auto& runner = + NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", repeat_times}}); + auto stream = + context.template device_context() + 
.stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_NPU_KERNEL( + tile, ops::TileNPUKernel, + ops::TileNPUKernel, + ops::TileNPUKernel); diff --git a/paddle/fluid/operators/top_k_v2_op_npu.cc b/paddle/fluid/operators/top_k_v2_op_npu.cc new file mode 100755 index 00000000000000..e536055013fb88 --- /dev/null +++ b/paddle/fluid/operators/top_k_v2_op_npu.cc @@ -0,0 +1,94 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/top_k_v2_op.h" +#include +#include +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { +// NOTE(Ruibiao): the Ascend TopKV2 operator used in this kernel +// may lead to large accuracy error for float32 data +template +class TopkV2NPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* k_tensor = context.Input("K"); + auto* out = context.Output("Out"); + auto* indices = context.Output("Indices"); // type: INT64 + + int32_t k = static_cast(context.Attr("k")); + int axis = static_cast(context.Attr("axis")); + const bool sorted = static_cast(context.Attr("sorted")); + const bool largest = static_cast(context.Attr("largest")); + + if (axis < 0) { + axis += input->dims().size(); + } + + if (k_tensor != nullptr) { + std::vector v_tmp(1); + TensorToVector( + *k_tensor, + context.template device_context(), + &v_tmp); + k = static_cast(v_tmp[0]); + } + + framework::DDim output_dims = input->dims(); + output_dims[axis] = k; + + out->Resize(output_dims); + indices->Resize(output_dims); + + out->mutable_data(context.GetPlace()); + indices->mutable_data(context.GetPlace()); + + framework::Tensor indices_int32(framework::proto::VarType::INT32); + indices_int32.Resize(output_dims); + indices_int32.mutable_data(context.GetPlace()); + + auto npu_stream = + context.template device_context() + .stream(); + + NpuOpRunner npu_op_runner_topkv2; + npu_op_runner_topkv2.SetType("TopKV2") + .AddInput(*input) + .AddInput(std::vector{k}) + .AddOutput(*out) + .AddOutput(indices_int32) + .AddAttr("sorted", sorted) + .AddAttr("dim", axis) + .AddAttr("largest", largest) + .Run(npu_stream); + + // Cast 'indices_int32' to 'indices', from INT32 to INT64 + auto dst_dtype = ConvertToNpuDtype(indices->type()); + const auto& npu_op_runner_cast = + NpuOpRunner("Cast", {indices_int32}, {*indices}, + {{"dst_type", static_cast(dst_dtype)}}); + npu_op_runner_cast.Run(npu_stream); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_NPU_KERNEL(top_k_v2, ops::TopkV2NPUKernel, + ops::TopkV2NPUKernel, + ops::TopkV2NPUKernel, + ops::TopkV2NPUKernel); diff --git a/paddle/fluid/operators/unfold_op.cc b/paddle/fluid/operators/unfold_op.cc index 0a36b6ef840887..5c0eb64993b556 100644 --- a/paddle/fluid/operators/unfold_op.cc +++ 
b/paddle/fluid/operators/unfold_op.cc @@ -107,6 +107,42 @@ class UnfoldOp : public framework::OperatorWithKernel { "But recieved dims(strides: %u) != dims(dilations: %u).", strides.size(), dilations.size())); + // check kernel_sizes + PADDLE_ENFORCE_GT(kernel_sizes[0], 0, + platform::errors::InvalidArgument( + "The `kernel_sizes` should be greater than zero, " + "but recieved kernel_height: %d kernel_width: %d.", + kernel_sizes[0], kernel_sizes[1])); + PADDLE_ENFORCE_GT(kernel_sizes[1], 0, + platform::errors::InvalidArgument( + "The `kernel_sizes` should be greater than zero, " + "but recieved kernel_height: %d kernel_width: %d.", + kernel_sizes[0], kernel_sizes[1])); + // check strides + PADDLE_ENFORCE_GT(strides[0], 0, + platform::errors::InvalidArgument( + "The `strides` should be greater than zero, " + "but recieved strides_height: %d strides_width: %d.", + strides[0], strides[1])); + PADDLE_ENFORCE_GT(strides[1], 0, + platform::errors::InvalidArgument( + "The `strides` should be greater than zero, " + "but recieved strides_height: %d strides_width: %d.", + strides[0], strides[1])); + // check dilations + PADDLE_ENFORCE_GT( + dilations[0], 0, + platform::errors::InvalidArgument( + "The `dilations` should be greater than zero, " + "but recieved dilations_height: %d dilations_width: %d.", + dilations[0], dilations[1])); + PADDLE_ENFORCE_GT( + dilations[1], 0, + platform::errors::InvalidArgument( + "The `dilations` should be greater than zero, " + "but recieved dilations_height: %d dilations_width: %d.", + dilations[0], dilations[1])); + std::vector out_dims; out_dims.push_back(in_dims[0]); diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h index e90eefd72d4ce2..f5b51da3d85831 100644 --- a/paddle/fluid/operators/warpctc_op.h +++ b/paddle/fluid/operators/warpctc_op.h @@ -199,6 +199,27 @@ class WarpCTCKernel : public framework::OpKernel { sequence_width = logits->dims()[2]; max_sequence_length = logits->dims()[0]; + PADDLE_ENFORCE_GT(max_sequence_length, 0, + platform::errors::InvalidArgument( + "The first dimension of Input(Logits) should be " + "greater than zero " + "but received %d. ", + max_sequence_length)); + + PADDLE_ENFORCE_GT(num_sequences, 0, + platform::errors::InvalidArgument( + "The second dimension of Input(Logits) should be " + "greater than zero " + "but received %d. ", + num_sequences)); + + PADDLE_ENFORCE_GT(sequence_width, 0, + platform::errors::InvalidArgument( + "The third dimension of Input(Logits) should be " + "greater than zero " + "but received %d. ", + sequence_width)); + auto* logits_length = ctx.Input("LogitsLength"); auto* labels_length = ctx.Input("LabelLength"); framework::Tensor logits_length_cpu; @@ -229,6 +250,13 @@ class WarpCTCKernel : public framework::OpKernel { logits_lod = framework::ToAbsOffset(logits->lod())[0]; auto logits_dims = logits->dims(); + PADDLE_ENFORCE_GT(logits_dims[0], 0, + platform::errors::InvalidArgument( + "The first dimension of Input(Logits) should be " + "greater than zero " + "but received %d. 
", + logits_dims[0])); + PADDLE_ENFORCE_EQ( logits_dims[0], static_cast(logits_lod.back()), platform::errors::InvalidArgument( diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index efd25bc8929409..97c81568e673e8 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -70,7 +70,7 @@ cc_test(place_test SRCS place_test.cc DEPS place glog gflags) if(WITH_XPU) cc_library(xpu_info SRCS xpu/xpu_info.cc DEPS gflags glog enforce xpulib) -cc_library(xpu_op_list SRCS xpu/xpu_op_list.cc DEPS gflags glog enforce xpulib) +cc_library(xpu_op_list SRCS xpu/xpu_op_list.cc DEPS gflags glog enforce xpulib device_context) endif() if(WITH_ASCEND) diff --git a/paddle/fluid/platform/cudnn_workspace_helper.cc b/paddle/fluid/platform/cudnn_workspace_helper.cc index c4e71c86f9e750..bb0e9a226d1500 100644 --- a/paddle/fluid/platform/cudnn_workspace_helper.cc +++ b/paddle/fluid/platform/cudnn_workspace_helper.cc @@ -15,13 +15,14 @@ #include "paddle/fluid/platform/cudnn_workspace_helper.h" #include -#include "boost/lexical_cast.hpp" +#include + namespace paddle { namespace platform { static int GetDefaultConvWorkspaceSizeLimitMBImpl() { const char *env_str = std::getenv("FLAGS_conv_workspace_size_limit"); - return env_str ? boost::lexical_cast(std::string(env_str)) + return env_str ? std::stoi(std::string(env_str)) : kDefaultConvWorkspaceSizeLimitMB; } diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h index 93fc56ab203b60..a79ab22743d166 100644 --- a/paddle/fluid/platform/event.h +++ b/paddle/fluid/platform/event.h @@ -120,6 +120,7 @@ class MemEvent { class CudaEvent { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + public: CudaEvent() { #ifdef PADDLE_WITH_HIP @@ -129,7 +130,7 @@ class CudaEvent { #endif } - CudaEvent(unsigned int flags) : flags_(flags) { + explicit CudaEvent(unsigned int flags) : flags_(flags) { #ifdef PADDLE_WITH_HIP hipEventCreateWithFlags(&event_, flags_); #else @@ -137,7 +138,15 @@ class CudaEvent { #endif } - void Record(paddle::platform::stream::CUDAStream& stream) { + ~CudaEvent() { +#ifdef PADDLE_WITH_HIP + hipEventDestroy(event_); +#else + cudaEventDestroy(event_); +#endif + } + + void Record(const paddle::platform::stream::CUDAStream& stream) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event_, stream.raw_stream())); #else diff --git a/paddle/fluid/platform/fast_divmod.h b/paddle/fluid/platform/fast_divmod.h index c6c22bb2f9203b..02f9d5441281c1 100644 --- a/paddle/fluid/platform/fast_divmod.h +++ b/paddle/fluid/platform/fast_divmod.h @@ -20,7 +20,7 @@ limitations under the License. */ #define INT_BITS 32 namespace paddle { -namespace operators { +namespace platform { template struct alignas(sizeof(T) * Size) CudaAlignedVector { @@ -65,5 +65,39 @@ struct FastDivMod { uint32_t multiplier; }; -} // namespace operators +/* +* Only the address of input data is the multiplier of 1,2,4, vectorized load +* with corresponding multiplier-value is possible. Moreover, the maximum length +* of vectorized load is 128 bits once. Hence, valid length of vectorized load +* shall be determined under both former constraints. 
+*/ +template +int GetVectorizedSize(const T *pointer) { + constexpr int max_load_bits = 128; + int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T); + uint64_t address = reinterpret_cast(pointer); + constexpr int vec8 = + std::alignment_of>::value; // NOLINT + constexpr int vec4 = + std::alignment_of>::value; // NOLINT + constexpr int vec2 = + std::alignment_of>::value; // NOLINT + if (address % vec8 == 0) { + /* + * Currently, decide to deal with no more than 4 data once while adopting + * vectorization load/store, if performance test shows that dealing with + * 8 data once in vectorization load/store does get optimized, return code + * below can be changed into " return std::min(8, valid_vec_size); " . + */ + return std::min(4, valid_vec_size); + } else if (address % vec4 == 0) { + return std::min(4, valid_vec_size); + } else if (address % vec2 == 0) { + return std::min(2, valid_vec_size); + } else { + return 1; + } +} + +} // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/xpu/xpu1_op_list.h b/paddle/fluid/platform/xpu/xpu1_op_list.h index 131525718cac75..cdd60a856fbc90 100644 --- a/paddle/fluid/platform/xpu/xpu1_op_list.h +++ b/paddle/fluid/platform/xpu/xpu1_op_list.h @@ -55,25 +55,51 @@ XPUOpMap& get_kl1_ops() { XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"affine_channel_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"assign", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"assign", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace())})}, {"batch_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"batch_norm_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"cast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"cast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, {"clip_by_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"coalesce_tensor", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, {"c_reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"c_allreduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"broadcast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"broadcast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace())})}, {"concat", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"concat_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"logicalor", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"logicaland", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"logicalnot", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"logicalor", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::INT16, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"logicaland", 
XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::INT16, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"logicalnot", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::INT16, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"depthwise_conv2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"depthwise_conv2d_grad", @@ -116,7 +142,11 @@ XPUOpMap& get_kl1_ops() { {"elementwise_min_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"fill_constant", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"gather", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"gather_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"gaussian_random", @@ -140,7 +170,11 @@ XPUOpMap& get_kl1_ops() { {"layer_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"layer_norm_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"load", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"load", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"log_loss", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"log_loss_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, @@ -158,15 +192,20 @@ XPUOpMap& get_kl1_ops() { {"accuracy", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"mul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"one_hot", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"one_hot_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"one_hot", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace())})}, + {"one_hot_v2", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace())})}, {"sgd", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"adam", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"rmsprop", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"lamb", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"pool2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"pool2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"range", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"range", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reduce_sum_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, @@ -175,30 +214,67 @@ XPUOpMap& get_kl1_ops() { {"reduce_max", 
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reduce_max_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reshape2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reshape2", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"reshape2_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"rnn", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"rnn_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"roi_align", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"roi_align_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"scale", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"shape", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"shape", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"sign", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"slice", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"slice", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, {"slice_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"softmax_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"softmax_with_cross_entropy", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"squeeze", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"squeeze", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"squeeze_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"squeeze2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"squeeze2", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"squeeze2_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + 
pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"stack", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"top_k", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, @@ -212,12 +288,36 @@ XPUOpMap& get_kl1_ops() { XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"uniform_random", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"unsqueeze", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"unsqueeze", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"unsqueeze_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"unsqueeze2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"unsqueeze2", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"unsqueeze2_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"momuntem", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})} // AddMore }; diff --git a/paddle/fluid/platform/xpu/xpu_op_list.cc b/paddle/fluid/platform/xpu/xpu_op_list.cc index b3349407942bd1..0c10436f397898 100644 --- a/paddle/fluid/platform/xpu/xpu_op_list.cc +++ b/paddle/fluid/platform/xpu/xpu_op_list.cc @@ -9,7 +9,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU +#include #include +#include #include "paddle/fluid/platform/xpu/xpu1_op_list.h" #include "paddle/fluid/platform/xpu/xpu2_op_list.h" @@ -19,7 +21,7 @@ limitations under the License. 
*/ namespace paddle { namespace platform { -bool is_xpu_support_op(std::string op_name, const pOpKernelType& type) { +bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type) { auto& ops = get_kl1_ops(); auto v = get_xpu_version(BOOST_GET_CONST(platform::XPUPlace, type.place_).device); @@ -34,6 +36,45 @@ bool is_xpu_support_op(std::string op_name, const pOpKernelType& type) { return false; } +// ops_string contains op_list(e.g., 'mul,mul_grad'), parse the op string and +// insert op to op set +static void tokenize(const std::string& ops, char delim, + std::unordered_set* op_set) { + std::string::size_type beg = 0; + for (uint64_t end = 0; (end = ops.find(delim, end)) != std::string::npos; + ++end) { + op_set->insert(ops.substr(beg, end - beg)); + beg = end + 1; + } + + op_set->insert(ops.substr(beg)); +} + +bool is_in_xpu_black_list(const std::string& op_name) { + static bool inited = false; + static std::unordered_set xpu_black_list; + static std::mutex s_mtx; + if (!inited) { + std::lock_guard guard(s_mtx); + if (!inited) { + if (std::getenv("XPU_BLACK_LIST") != nullptr) { + std::string ops(std::getenv("XPU_BLACK_LIST")); + tokenize(ops, ',', &xpu_black_list); + } + inited = true; + VLOG(3) << "XPU Black List: "; + for (auto iter = xpu_black_list.begin(); iter != xpu_black_list.end(); + ++iter) { + VLOG(3) << *iter << " "; + } + } + } + if (xpu_black_list.find(op_name) != xpu_black_list.end()) { + return true; + } + return false; +} + } // namespace platform } // namespace paddle #endif diff --git a/paddle/fluid/platform/xpu/xpu_op_list.h b/paddle/fluid/platform/xpu/xpu_op_list.h index 487bc8ac48b66f..705f701e13634a 100644 --- a/paddle/fluid/platform/xpu/xpu_op_list.h +++ b/paddle/fluid/platform/xpu/xpu_op_list.h @@ -20,7 +20,8 @@ namespace platform { using pOpKernelType = paddle::framework::OpKernelType; -bool is_xpu_support_op(std::string op_name, const pOpKernelType& type); +bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type); +bool is_in_xpu_black_list(const std::string& op_name); } // namespace platform } // namespace paddle diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index f362808a4b9528..b8774f429632e2 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,6 +1,7 @@ # Adapt to custom op mechanism: Include the header files related to the data type # to avoid exposing the path of the underlying file include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform) +include_directories(${PADDLE_SOURCE_DIR}/paddle/utils) set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper prune feed_fetch_method pass pass_builder parallel_executor profiler layer tracer engine scope_pool @@ -124,23 +125,20 @@ if(WITH_PYTHON) set(impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function_impl.h) set(tmp_impl_file ${impl_file}.tmp) + set(OP_IMPL_DEPS op_function_generator) if(WIN32) if("${CMAKE_GENERATOR}" STREQUAL "Ninja") - set(op_function_generator_path "${CMAKE_CURRENT_BINARY_DIR}") + set(op_impl_path "${CMAKE_CURRENT_BINARY_DIR}") else() - set(op_function_generator_path "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}") + set(op_impl_path "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}") endif() - file(TO_NATIVE_PATH ${op_function_generator_path} op_function_generator_path) - file(TO_NATIVE_PATH ${impl_file} impl_file) - file(TO_NATIVE_PATH ${tmp_impl_file} tmp_impl_file) file(WRITE 
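The new is_in_xpu_black_list helper above parses the XPU_BLACK_LIST environment variable once (a comma-separated op list such as mul,mul_grad) into a cached set. A minimal usage sketch; how the framework reacts to a black-listed op is an assumption here, since the patch only adds the query:

    import os

    # Hypothetical op names; set the variable before the first query, because the
    # parsed set is cached after the first call.
    os.environ["XPU_BLACK_LIST"] = "concat,pool2d"

    import paddle
    paddle.set_device("xpu")  # XPU build assumed; black-listed ops are expected to
                              # fall back to their CPU kernels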
${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat "" "set build_times=1\n" ":retry\n" "ECHO op_function_generator run %build_times% time\n" - "if exist ${tmp_impl_file} del ${tmp_impl_file}\n" "taskkill /f /im op_function_generator.exe 2>NUL\n" - "${op_function_generator_path}\\op_function_generator.exe ${tmp_impl_file}\n" + "${op_impl_path}/op_function_generator.exe ${tmp_impl_file}\n" "if %ERRORLEVEL% NEQ 0 (\n" " set /a build_times=%build_times%+1\n" " if %build_times% GEQ 10 (\n" @@ -151,63 +149,61 @@ if(WITH_PYTHON) ")\n" "exit /b 0") - add_custom_command(TARGET op_function_generator POST_BUILD - COMMAND ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} ${impl_file} - COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}" - ) - if(${CBLAS_PROVIDER} STREQUAL MKLML) - add_custom_command(TARGET op_function_generator - PRE_LINK - COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_LIB} ${op_function_generator_path} - COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB} ${op_function_generator_path} - ) + ADD_CUSTOM_COMMAND(OUTPUT ${op_impl_path}/libiomp5md.dll + COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB} ${op_impl_path} + DEPENDS mklml) + list(APPEND OP_IMPL_DEPS ${op_impl_path}/libiomp5md.dll) else(${CBLAS_PROVIDER} STREQUAL EXTERN_OPENBLAS) - add_custom_command(TARGET op_function_generator - PRE_LINK - COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_SHARED_LIB} ${op_function_generator_path} - ) + ADD_CUSTOM_COMMAND(OUTPUT ${op_impl_path}/openblas.dll + COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_SHARED_LIB} ${op_impl_path} + DEPENDS extern_openblas) + list(APPEND OP_IMPL_DEPS ${op_impl_path}/openblas.dll) endif() if(WITH_MKLDNN) - add_custom_command(TARGET op_function_generator - PRE_LINK - COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB} ${op_function_generator_path} - ) + ADD_CUSTOM_COMMAND(OUTPUT ${op_impl_path}/mkldnn.dll + COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB} ${op_impl_path} + DEPENDS mkldnn) + list(APPEND OP_IMPL_DEPS ${op_impl_path}/mkldnn.dll) endif() + + add_custom_command(OUTPUT ${impl_file} + COMMAND ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} ${impl_file} + COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}" + DEPENDS ${OP_IMPL_DEPS}) else(WIN32) # If there are no *.so in /usr/lib or LD_LIBRARY_PATH, # copy these *.so to current directory and append current directory to # LD_LIBRARY_PATH. This is different with Windows platformm, which search # *.dll in current directory automatically. - add_custom_command(TARGET op_function_generator - POST_BUILD + if(WITH_MKLML) + ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libiomp5.so + COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB} ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS mklml) + list(APPEND OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libiomp5.so) + endif() + if(WITH_MKLDNN) + ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libdnnl.so.0 + COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB} ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS mkldnn) + list(APPEND OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libdnnl.so.0) + endif() + add_custom_command(OUTPUT ${impl_file} COMMAND ${CMAKE_COMMAND} -E env "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:." 
"${CMAKE_CURRENT_BINARY_DIR}/op_function_generator" "${tmp_impl_file}" COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} ${impl_file} COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}" - VERBATIM - ) - if(WITH_MKL) - add_custom_command(TARGET op_function_generator - PRE_LINK - COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_LIB} ${CMAKE_CURRENT_BINARY_DIR} - COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB} ${CMAKE_CURRENT_BINARY_DIR} - ) - endif(WITH_MKL) - if(WITH_MKLDNN) - add_custom_command(TARGET op_function_generator - PRE_LINK - COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB} ${CMAKE_CURRENT_BINARY_DIR} - ) - endif(WITH_MKLDNN) + DEPENDS ${OP_IMPL_DEPS} + VERBATIM) endif(WIN32) + add_custom_target(op_function_generator_cmd ALL DEPENDS ${impl_file}) cc_library(paddle_pybind SHARED - SRCS ${PYBIND_SRCS} - DEPS ${PYBIND_DEPS} - ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) + SRCS ${PYBIND_SRCS} + DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) + if(NOT APPLE AND NOT WIN32) target_link_libraries(paddle_pybind rt) endif(NOT APPLE AND NOT WIN32) @@ -218,5 +214,5 @@ if(WITH_PYTHON) get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(paddle_pybind ${os_dependency_modules}) - add_dependencies(paddle_pybind op_function_generator) + add_dependencies(paddle_pybind op_function_generator_cmd) endif(WITH_PYTHON) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index b58e9050402bb7..040ae26213f5f5 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -42,6 +42,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/new_exec.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -1631,7 +1632,13 @@ All parameter, weight, gradient are variables in Paddle. .def("__repr__", string::to_string) .def("__str__", string::to_string); #ifdef PADDLE_WITH_XPU + py::enum_(m, "XPUVersion", py::arithmetic()) + .value("XPU1", platform::XPUVersion::XPU1) + .value("XPU2", platform::XPUVersion::XPU2) + .export_values(); m.def("get_xpu_device_count", platform::GetXPUDeviceCount); + m.def("get_xpu_device_version", + [](int device_id) { return platform::get_xpu_version(device_id); }); #endif py::class_(m, "CPUPlace", R"DOC( @@ -1935,6 +1942,34 @@ All parameter, weight, gradient are variables in Paddle. 
fetch_vars); }); + py::class_(m, "InterpreterCore") + .def(py::init()) + .def("run", + [](InterpreterCore &self, + const std::unordered_map &input_dict, + std::vector vec_fetch_name) { + pybind11::gil_scoped_release release; + std::vector vec_tensor; + std::vector vec_name; + + for (auto &item : input_dict) { + framework::LoDTensor t; + SetTensorFromPyArray( + &t, item.second, platform::CPUPlace(), false); + vec_name.push_back(item.first); + vec_tensor.push_back(t); + } + + std::vector vec_out; + self.run(vec_name, vec_tensor, vec_fetch_name, &vec_out); + std::vector vec_ret; + for (size_t i = 0; i < vec_out.size(); ++i) { + vec_ret.push_back(TensorToPyArray(vec_out[i], true)); + } + return vec_ret; + }); + m.def("init_gflags", framework::InitGflags); m.def("init_glog", framework::InitGLOG); m.def("load_op_meta_info_and_register_op", diff --git a/paddle/fluid/string/CMakeLists.txt b/paddle/fluid/string/CMakeLists.txt index a465f5909a7c6e..9667e18bc6a1e3 100644 --- a/paddle/fluid/string/CMakeLists.txt +++ b/paddle/fluid/string/CMakeLists.txt @@ -1,7 +1,8 @@ cc_library(stringpiece SRCS piece.cc DEPS flags) cc_library(pretty_log SRCS pretty_log.cc DEPS flags) -cc_library(string_helper SRCS string_helper.cc DEPS boost flags) +cc_library(string_helper SRCS string_helper.cc DEPS flags) cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags) cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags) cc_test(to_string_test SRCS to_string_test.cc) cc_test(split_test SRCS split_test.cc) +cc_test(string_helper_test SRCS string_helper_test.cc DEPS string_helper) diff --git a/paddle/fluid/string/string_helper.cc b/paddle/fluid/string/string_helper.cc index 8731e8fca8a5c4..141ac2ba47c5b9 100644 --- a/paddle/fluid/string/string_helper.cc +++ b/paddle/fluid/string/string_helper.cc @@ -88,6 +88,11 @@ inline int str_to_float(const char* str, float* v) { return index; } +bool ends_with(std::string const& input, std::string const& test) { + if (test.size() > input.size()) return false; + return std::equal(test.rbegin(), test.rend(), input.rbegin()); +} + // A helper class for reading lines from file. // A line buffer is maintained. It // doesn't need to know the maximum possible length of a line. @@ -100,7 +105,7 @@ char* LineFileReader::getdelim(FILE* f, char delim) { _buffer[--ret] = 0; } - _length = (size_t)ret; + _length = static_cast(ret); return _buffer; } else { _length = 0; diff --git a/paddle/fluid/string/string_helper.h b/paddle/fluid/string/string_helper.h index f7387e877af2cd..37b713766dd558 100644 --- a/paddle/fluid/string/string_helper.h +++ b/paddle/fluid/string/string_helper.h @@ -21,7 +21,6 @@ #include #include -#include "boost/lexical_cast.hpp" #include "glog/logging.h" namespace paddle { @@ -38,6 +37,7 @@ void format_string_append(std::string& str, const char* fmt, // NOLINT CHECK_GE(len, 0); size_t oldlen = str.length(); str.resize(oldlen + len + 1); + CHECK(snprintf(&str[oldlen], (size_t)len + 1, fmt, args...) == // NOLINT len); str.resize(oldlen + len); @@ -69,6 +69,9 @@ std::string erase_spaces(const std::string& str); int str_to_float(const char* str, float* v); +// checks whether the test string is a suffix of the input string. 
+bool ends_with(std::string const& input, std::string const& test); + // split string by delim template std::vector split_string(const std::string& str, const std::string& delim) { @@ -134,7 +137,9 @@ std::string join_strings(const Container& strs, char delim) { str += delim; } - str += boost::lexical_cast(elem); + std::stringstream ss; + ss << elem; + str += ss.str(); ++i; } @@ -151,7 +156,9 @@ std::string join_strings(const Container& strs, const std::string& delim) { str += delim; } - str += boost::lexical_cast(elem); + std::stringstream ss; + ss << elem; + str += ss.str(); ++i; } diff --git a/paddle/fluid/string/string_helper_test.cc b/paddle/fluid/string/string_helper_test.cc new file mode 100644 index 00000000000000..4796bf7507aba7 --- /dev/null +++ b/paddle/fluid/string/string_helper_test.cc @@ -0,0 +1,58 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/string/string_helper.h" + +#include + +#include "gtest/gtest.h" + +TEST(StringHelper, EndsWith) { + std::string input("hello world"); + std::string test1("world"); + std::string test2("helloworld"); + std::string test3("hello world hello world"); + + EXPECT_TRUE(paddle::string::ends_with(input, test1)); + EXPECT_TRUE(paddle::string::ends_with(input, input)); + + EXPECT_FALSE(paddle::string::ends_with(input, test2)); + EXPECT_FALSE(paddle::string::ends_with(input, test3)); +} + +TEST(StringHelper, FormatStringAppend) { + std::string str("hello"); + char fmt[] = "%d"; + + paddle::string::format_string_append(str, fmt, 10); + EXPECT_EQ(str, "hello10"); +} + +TEST(StringHelper, JoinStrings) { + std::vector v; + v.push_back("hello"); + v.push_back("world"); + + std::string result = paddle::string::join_strings(v, ' '); + EXPECT_EQ(result, "hello world"); + + result = paddle::string::join_strings(v, '\n'); + EXPECT_EQ(result, "hello\nworld"); + + result = paddle::string::join_strings(v, ','); + EXPECT_EQ(result, "hello,world"); + + result = paddle::string::join_strings(v, " new "); + EXPECT_EQ(result, "hello new world"); +} diff --git a/paddle/utils/any.h b/paddle/utils/any.h new file mode 100644 index 00000000000000..ec803647c11f7e --- /dev/null +++ b/paddle/utils/any.h @@ -0,0 +1,232 @@ +//This file copy from boost/any.hpp and boost version: 1.41.0 +//Modified the following points: +//1. modify namespace from boost::any to paddle::any +//2. remove the depending boost header files +//3. remove/modify some macro + +// See http://www.boost.org/libs/any for Documentation. 
+ +#ifndef PADDLE_ANY_INCLUDED +#define PADDLE_ANY_INCLUDED + +// what: variant type boost::any +// who: contributed by Kevlin Henney, +// with features contributed and bugs found by +// Ed Brey, Mark Rodgers, Peter Dimov, and James Curran +// when: July 2001 +// where: tested with BCC 5.5, MSVC 6.0, and g++ 2.95 + +#include +#include +#include + +// See boost/python/type_id.hpp +// TODO: add BOOST_TYPEID_COMPARE_BY_NAME to config.hpp +# if (defined(__GNUC__) && __GNUC__ >= 3) \ + || defined(_AIX) \ + || ( defined(__sgi) && defined(__host_mips)) \ + || (defined(__hpux) && defined(__HP_aCC)) \ + || (defined(linux) && defined(__INTEL_COMPILER) && defined(__ICC)) +# define BOOST_AUX_ANY_TYPE_ID_NAME +#include +# endif + +namespace paddle +{ + class any + { + public: // structors + + any() + : content(0) + { + } + + template + any(const ValueType & value) + : content(new holder(value)) + { + } + + any(const any & other) + : content(other.content ? other.content->clone() : 0) + { + } + + ~any() + { + delete content; + } + + public: // modifiers + + any & swap(any & rhs) + { + std::swap(content, rhs.content); + return *this; + } + + template + any & operator=(const ValueType & rhs) + { + any(rhs).swap(*this); + return *this; + } + + any & operator=(any rhs) + { + rhs.swap(*this); + return *this; + } + + public: // queries + + bool empty() const + { + return !content; + } + + const std::type_info & type() const + { + return content ? content->type() : typeid(void); + } + + public: // types (public so any_cast can be non-friend) + + class placeholder + { + public: // structors + + virtual ~placeholder() + { + } + + public: // queries + + virtual const std::type_info & type() const = 0; + + virtual placeholder * clone() const = 0; + + }; + + template + class holder : public placeholder + { + public: // structors + + holder(const ValueType & value) + : held(value) + { + } + + public: // queries + + virtual const std::type_info & type() const + { + return typeid(ValueType); + } + + virtual placeholder * clone() const + { + return new holder(held); + } + + public: // representation + + ValueType held; + + private: // intentionally left unimplemented + holder & operator=(const holder &); + }; + + public: // representation (public so any_cast can be non-friend) + + placeholder * content; + + }; + + class bad_any_cast : public std::bad_cast + { + public: + virtual const char * what() const throw() + { + return "paddle::bad_any_cast: " + "failed conversion using paddle::any_cast"; + } + }; + + template + ValueType * any_cast(any * operand) + { + return operand && +#ifdef BOOST_AUX_ANY_TYPE_ID_NAME + std::strcmp(operand->type().name(), typeid(ValueType).name()) == 0 +#else + operand->type() == typeid(ValueType) +#endif + ? &static_cast *>(operand->content)->held + : 0; + } + + template + inline const ValueType * any_cast(const any * operand) + { + return any_cast(const_cast(operand)); + } + + template + ValueType any_cast(any & operand) + { + typedef typename std::remove_reference::type nonref; + + // If 'nonref' is still reference type, it means the user has not + // specialized 'remove_reference'. 
+ + // Please use BOOST_BROKEN_COMPILER_TYPE_TRAITS_SPECIALIZATION macro + // to generate specialization of remove_reference for your class + // See type traits library documentation for details + static_assert(!std::is_reference::value, "!std::is_reference::value"); + + nonref * result = any_cast(&operand); + if(!result) + throw bad_any_cast(); + return *result; + } + + template + inline ValueType any_cast(const any & operand) + { + typedef typename std::remove_reference::type nonref; + + // The comment in the above version of 'any_cast' explains when this + // assert is fired and what to do. + static_assert(!std::is_reference::value, "!std::is_reference::value"); + + return any_cast(const_cast(operand)); + } + + // Note: The "unsafe" versions of any_cast are not part of the + // public interface and may be removed at any time. They are + // required where we know what type is stored in the any and can't + // use typeid() comparison, e.g., when our types may travel across + // different shared libraries. + template + inline ValueType * unsafe_any_cast(any * operand) + { + return &static_cast *>(operand->content)->held; + } + + template + inline const ValueType * unsafe_any_cast(const any * operand) + { + return unsafe_any_cast(const_cast(operand)); + } +} + +// Copyright Kevlin Henney, 2000, 2001, 2002. All rights reserved. +// +// Distributed under the Boost Software License, Version 1.0. (See +// accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#endif diff --git a/python/paddle/autograd/backward_mode.py b/python/paddle/autograd/backward_mode.py index 6efbe777d537ca..36ca048c51210f 100644 --- a/python/paddle/autograd/backward_mode.py +++ b/python/paddle/autograd/backward_mode.py @@ -14,6 +14,7 @@ from paddle.fluid import core from paddle.fluid import framework +from paddle.fluid.backward import gradients_with_optimizer import paddle __all__ = [] diff --git a/python/paddle/distributed/elastic.py b/python/paddle/distributed/elastic.py index 3e4fea5e6f34d7..e6f21f6603d8da 100644 --- a/python/paddle/distributed/elastic.py +++ b/python/paddle/distributed/elastic.py @@ -37,6 +37,9 @@ def scale_np(self, np): return True return False + def clean(self): + self.etcd.delete_prefix(self.prefix) + def close(self): self.etcd.close() @@ -53,13 +56,6 @@ def close(self): args = parser.parse_args() server = args.elastic_server or os.getenv('PADDLE_ELASTIC_SERVER') - # compatible with kuberntes service discovery - if not server and os.getenv( - 'PADDLE_ELASTIC_ETCD_SERVICE_HOST') and os.getenv( - 'PADDLE_ELASTIC_ETCD_SERVICE_PORT'): - server = '{}:{}'.format( - os.getenv('PADDLE_ELASTIC_ETCD_SERVICE_HOST'), - os.getenv('PADDLE_ELASTIC_ETCD_SERVICE_PORT')) name = args.job_id or os.getenv('PADDLE_ELASTIC_JOB_ID') np = args.np or int(os.getenv('PADDLE_ELASTIC_NP', 0)) @@ -69,6 +65,9 @@ def close(self): if args.action == "scale": cmd.scale_np(np) + if args.action == "clean": + cmd.clean() + print("action {} done".format(args.action)) cmd.close() diff --git a/python/paddle/distributed/fleet/elastic/__init__.py b/python/paddle/distributed/fleet/elastic/__init__.py new file mode 100644 index 00000000000000..1ac81729d5430a --- /dev/null +++ b/python/paddle/distributed/fleet/elastic/__init__.py @@ -0,0 +1,74 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import signal +import os, sys + +from .manager import ElasticManager +from .manager import ElasticStatus +from .manager import ELASTIC_EXIT_CODE +from .collective import CollectiveLauncher + +from paddle.distributed.fleet.launch_utils import DistributeMode + + +def enable_elastic(args, distribute_mode): + if distribute_mode != DistributeMode.COLLECTIVE: + return False + + if not args.elastic_server and not os.getenv('PADDLE_ELASTIC_SERVER'): + return False + + if not args.job_id and not os.getenv('PADDLE_ELASTIC_JOB_ID'): + return False + + if not args.np and not int(os.getenv('PADDLE_ELASTIC_NP', 0)): + return False + + return True + + +def launch_elastic(args, distribute_mode): + + elastic = ElasticManager(args) + + signal.signal(signal.SIGTERM, elastic.signal_handler) + signal.signal(signal.SIGABRT, elastic.signal_handler) + signal.signal(signal.SIGINT, elastic.signal_handler) + + while True: + + # wait for all nodes ready to run + elastic.wait() + + # run self with specified launcher + elastic.run(CollectiveLauncher) + + # keep wathing the health status of self and being notified for other's failure + ret = elastic.watch() + if ret == ElasticStatus.COMPLETED: + break + if ret == ElasticStatus.HOLD: + continue + if ret == ElasticStatus.EXIT: + break + if ret == ElasticStatus.ERROR: + sys.exit(3) + if ret == ElasticStatus.RESTART: + sys.exit(ELASTIC_EXIT_CODE) + + if int(elastic.sigint) > 0: + sys.exit(128 + int(elastic.sigint)) + else: + sys.exit(0) diff --git a/python/paddle/distributed/fleet/elastic/collective.py b/python/paddle/distributed/fleet/elastic/collective.py new file mode 100644 index 00000000000000..94fe6a54b5809b --- /dev/null +++ b/python/paddle/distributed/fleet/elastic/collective.py @@ -0,0 +1,93 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.distributed.fleet import launch_utils +import paddle.distributed.fleet.cloud_utils as cloud_utils +import paddle.distributed.fleet.ascend_utils as ascend_utils + +from paddle.distributed.fleet.launch_utils import * + +from paddle.distributed.fleet.elastic.manager import LauncherInterface + + +class CollectiveLauncher(LauncherInterface): + def __init__(self, args): + self.args = args + self.procs = [] + + def launch(self): + logger.info("collective lauchner launch ...") + args = self.args + # parse arguments, used for cloud-single-machine and local + (device_mode, + devices_per_proc) = launch_utils.get_device_proc_info(args) + trainers_num = cloud_utils.get_trainers_num() + logger.debug("parsed from args trainerss_num:{} mode:{} devices:{}". 
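enable_elastic above only switches a collective job into elastic mode when a server endpoint, a job id and a target np are all supplied, either as command-line flags or as environment variables. A minimal sketch of the environment; the concrete values are placeholders, not taken from the patch:

    import os

    os.environ["PADDLE_ELASTIC_SERVER"] = "127.0.0.1:2379"  # etcd endpoint, also --elastic_server
    os.environ["PADDLE_ELASTIC_JOB_ID"] = "job-0"           # also --job_id
    os.environ["PADDLE_ELASTIC_NP"] = "2"                    # also --np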
+ format(trainers_num, device_mode, devices_per_proc)) + + cluster = None + pod = None + + start_port = 6170 + if os.environ.get('FLAGS_START_PORT') is not None: + start_port = os.environ.get('FLAGS_START_PORT') + if cloud_utils.use_paddlecloud() and trainers_num != 1: + cluster, pod = cloud_utils.get_cloud_cluster( + args.ips, device_mode, devices_per_proc, start_port) + logger.debug("get cluster from cloud:{}".format(cluster)) + elif device_mode == DeviceMode.ASCEND_NPU: + # for ascend + cluster, pod = ascend_utils.get_cloud_cluster( + rank_table_file=os.getenv("RANK_TABLE_FILE", None), + device_mode=device_mode, + start_port=start_port) + else: + # trainers_num = 1 or not use paddlecloud ips="a,b" + cluster, pod = paddle.distributed.fleet.launch.get_cluster_from_args( + args, device_mode, devices_per_proc) + logger.debug("get cluster from args:{}".format(cluster)) + + global_envs = copy.copy(os.environ.copy()) + self.gloo_rendezvous_dir = tempfile.mkdtemp() + # add gloo env + global_envs["PADDLE_WITH_GLOO"] = str( + os.getenv("PADDLE_WITH_GLOO", "0")) + global_envs["PADDLE_GLOO_RENDEZVOUS"] = "3" + global_envs["PADDLE_GLOO_FS_PATH"] = self.gloo_rendezvous_dir + + self.procs = start_local_trainers( + cluster, + pod, + training_script=args.training_script, + training_script_args=args.training_script_args, + log_dir=args.log_dir, + envs=global_envs) + + for idx, proc in enumerate(self.procs): + logger.info("launch proc_id:{} idx:{}".format(proc.proc.pid, idx)) + + def stop(self): + logger.info("collective lauchner stop ...") + if not self._terminate_procs(): + logger.error("kill process failed") + if os.path.exists(self.gloo_rendezvous_dir): + shutil.rmtree(self.gloo_rendezvous_dir) + + def watch(self): + logger.debug("collective lauchner watch ...") + for p in self.procs: + if p.log_fn and p.local_rank == 0: + pull_worker_log(p) + ret = self._check_procs() + return ret diff --git a/python/paddle/distributed/fleet/elastic.py b/python/paddle/distributed/fleet/elastic/manager.py similarity index 100% rename from python/paddle/distributed/fleet/elastic.py rename to python/paddle/distributed/fleet/elastic/manager.py diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index f407892e79acf6..bc7942826e1eaa 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -69,17 +69,13 @@ import paddle import paddle.fluid as fluid from paddle.distributed.fleet import launch_utils -import signal # TODO(danleifeng): Don't import * from a module from paddle.distributed.fleet.launch_utils import * import paddle.distributed.fleet.cloud_utils as cloud_utils import paddle.distributed.fleet.ascend_utils as ascend_utils -from paddle.distributed.fleet.elastic import ElasticManager -from paddle.distributed.fleet.elastic import LauncherInterface -from paddle.distributed.fleet.elastic import ElasticStatus -from paddle.distributed.fleet.elastic import ELASTIC_EXIT_CODE +from paddle.distributed.fleet.elastic import enable_elastic, launch_elastic __all__ = [] @@ -235,76 +231,71 @@ def get_cluster_from_args(args, device_mode, devices_per_proc): devices_per_proc) -class CollectiveLauncher(LauncherInterface): - def __init__(self, args): - self.args = args - self.procs = [] +def launch_collective(args): + # parse arguments, used for cloud-single-machine and local + (device_mode, devices_per_proc) = launch_utils.get_device_proc_info(args) + trainers_num = cloud_utils.get_trainers_num() + logger.debug("parsed from args 
trainerss_num:{} mode:{} devices:{}".format( + trainers_num, device_mode, devices_per_proc)) + + cluster = None + pod = None + + start_port = 6170 + if os.environ.get('FLAGS_START_PORT') is not None: + start_port = os.environ.get('FLAGS_START_PORT') + if cloud_utils.use_paddlecloud() and trainers_num != 1: + cluster, pod = cloud_utils.get_cloud_cluster( + args.ips, device_mode, devices_per_proc, start_port) + logger.debug("get cluster from cloud:{}".format(cluster)) + elif device_mode == DeviceMode.ASCEND_NPU: + # for ascend + cluster, pod = ascend_utils.get_cloud_cluster( + rank_table_file=os.getenv("RANK_TABLE_FILE", None), + device_mode=device_mode, + start_port=start_port) + else: + # trainers_num = 1 or not use paddlecloud ips="a,b" + cluster, pod = get_cluster_from_args(args, device_mode, + devices_per_proc) + logger.debug("get cluster from args:{}".format(cluster)) + + global_envs = copy.copy(os.environ.copy()) + gloo_rendezvous_dir = tempfile.mkdtemp() + # add gloo env + global_envs["PADDLE_WITH_GLOO"] = str(os.getenv("PADDLE_WITH_GLOO", "0")) + global_envs["PADDLE_GLOO_RENDEZVOUS"] = "3" + global_envs["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir + + procs = start_local_trainers( + cluster, + pod, + training_script=args.training_script, + training_script_args=args.training_script_args, + log_dir=args.log_dir, + envs=global_envs) + + for idx, proc in enumerate(procs): + print("launch proc_id:{} idx:{}".format(proc.proc.pid, idx)) - def launch(self): - logger.info("collective lauchner launch ...") - args = self.args - # parse arguments, used for cloud-single-machine and local - (device_mode, - devices_per_proc) = launch_utils.get_device_proc_info(args) - trainers_num = cloud_utils.get_trainers_num() - logger.debug("parsed from args trainerss_num:{} mode:{} devices:{}". 
- format(trainers_num, device_mode, devices_per_proc)) + while True: + try: + alive = watch_local_trainers(procs, cluster.trainers_nranks()) - cluster = None - pod = None + if not alive: + logger.info("Local processes completed.") + logger.debug("POD info:{}".format(pod)) + break - start_port = 6170 - if os.environ.get('FLAGS_START_PORT') is not None: - start_port = os.environ.get('FLAGS_START_PORT') - if cloud_utils.use_paddlecloud() and trainers_num != 1: - cluster, pod = cloud_utils.get_cloud_cluster( - args.ips, device_mode, devices_per_proc, start_port) - logger.debug("get cluster from cloud:{}".format(cluster)) - elif device_mode == DeviceMode.ASCEND_NPU: - # for ascend - cluster, pod = ascend_utils.get_cloud_cluster( - rank_table_file=os.getenv("RANK_TABLE_FILE", None), - device_mode=device_mode, - start_port=start_port) - else: - # trainers_num = 1 or not use paddlecloud ips="a,b" - cluster, pod = get_cluster_from_args(args, device_mode, - devices_per_proc) - logger.debug("get cluster from args:{}".format(cluster)) - - global_envs = copy.copy(os.environ.copy()) - self.gloo_rendezvous_dir = tempfile.mkdtemp() - # add gloo env - global_envs["PADDLE_WITH_GLOO"] = str( - os.getenv("PADDLE_WITH_GLOO", "0")) - global_envs["PADDLE_GLOO_RENDEZVOUS"] = "3" - global_envs["PADDLE_GLOO_FS_PATH"] = self.gloo_rendezvous_dir - - self.procs = start_local_trainers( - cluster, - pod, - training_script=args.training_script, - training_script_args=args.training_script_args, - log_dir=args.log_dir, - envs=global_envs) - - for idx, proc in enumerate(self.procs): - logger.info("launch proc_id:{} idx:{}".format(proc.proc.pid, idx)) - - def stop(self): - logger.info("collective lauchner stop ...") - if not self._terminate_procs(): - logger.error("kill process failed") - if os.path.exists(self.gloo_rendezvous_dir): - shutil.rmtree(self.gloo_rendezvous_dir) - - def watch(self): - logger.debug("collective lauchner watch ...") - for p in self.procs: - if p.log_fn and p.local_rank == 0: - pull_worker_log(p) - ret = self._check_procs() - return ret + time.sleep(3) + + except: + logger.warning("Terminating... 
exit") + terminate_local_procs(procs) + exit(1) + + if os.path.exists(gloo_rendezvous_dir): + shutil.rmtree(gloo_rendezvous_dir) def launch_ps(args, distribute_mode): @@ -399,42 +390,15 @@ def launch(): _print_arguments(args) distribute_mode = which_distributed_mode(args) - # TODO(kuizhiqing) support ps later - if not distribute_mode == DistributeMode.COLLECTIVE: - launch_ps(args, distribute_mode) - return - - elastic = ElasticManager(args) - - signal.signal(signal.SIGTERM, elastic.signal_handler) - signal.signal(signal.SIGABRT, elastic.signal_handler) - signal.signal(signal.SIGINT, elastic.signal_handler) - while True: + if enable_elastic(args, distribute_mode): + launch_elastic(args, distribute_mode) + return - # wait for all nodes ready to run - elastic.wait() - - # run self with specified launcher - elastic.run(CollectiveLauncher) - - # keep wathing the health status of self and being notified for other's failure - ret = elastic.watch() - if ret == ElasticStatus.COMPLETED: - break - if ret == ElasticStatus.HOLD: - continue - if ret == ElasticStatus.EXIT: - break - if ret == ElasticStatus.ERROR: - sys.exit(3) - if ret == ElasticStatus.RESTART: - sys.exit(ELASTIC_EXIT_CODE) - - if int(elastic.sigint) > 0: - sys.exit(128 + int(elastic.sigint)) + if distribute_mode == DistributeMode.COLLECTIVE: + launch_collective(args) else: - sys.exit(0) + launch_ps(args, distribute_mode) if __name__ == "__main__": diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index 6ead643df6c1b8..e114670440c065 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -307,6 +307,17 @@ def get_cluster(node_ips, node_ip, trainer_endpoints, device_mode, def terminate_local_procs(procs): + # try to terminate process by group, this happend in multiprocess senario in user process + if os.name != 'nt': + for p in procs: + if p.proc.poll() is None: + os.killpg(os.getpgid(p.proc.pid), signal.SIGTERM) + if p.log_fn: + p.log_fn.close() + logger.info("terminate process group gid:{}".format(p.proc.pid)) + + time.sleep(1) + for p in procs: if p.proc.poll() is None: p.proc.terminate() @@ -583,19 +594,19 @@ def watch_local_trainers(procs, nranks): except KeyboardInterrupt: logger.warning("KeyboardInterrupt, exit") terminate_local_procs(procs) - raise + return except SystemExit: logger.error( "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.". format(nranks, error_rank)) terminate_local_procs(procs) - raise + return except: logger.error( "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.". 
format(nranks, error_rank)) terminate_local_procs(procs) - raise + return return alive diff --git a/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py index 2205f79ef4633f..c923624651c6ae 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py @@ -217,9 +217,13 @@ def _allreduce_fusion_program(self): block = self.main_program.global_block() ring_id = self.global_ring_id param_grads = [] + first_backward_idx = -1 # find all grad params - for op in reversed(block.ops): + for idx, op in enumerate(block.ops): + if first_backward_idx == -1 and \ + is_backward_op(op): + first_backward_idx = idx if is_backward_op(op) and \ OP_ROLE_VAR_KEY in op.attr_names: op_role_var = op.attr(OP_ROLE_VAR_KEY) @@ -234,70 +238,100 @@ def _allreduce_fusion_program(self): grad = block.var(grad_name) if param.is_distributed: continue - param_grads.append(grad) + param_grads.append((param, grad)) + + outputs_name_to_idx = self.__get_ouputs_name_to_idx(first_backward_idx, + block) - segments = [] + # structure of grad_param_segments is + # [([grad0, grad1], [param0, param1]), ([grad2, grad3], [param2, param3])] + # each entry of the list is a tuple stores the grads segment list and + # the corresponding params segment list + grad_param_segments = [] last_dtype = None # split the grad based on dtype and fused size - for var in param_grads: - if len(segments) == 0 \ - or len(segments[-1]) == self.fuse_grad_size_in_num \ - or var.dtype != last_dtype: - segments.append([var]) - last_dtype = var.dtype + for param, grad in param_grads: + if len(grad_param_segments) == 0 \ + or len(grad_param_segments[-1][0]) == self.fuse_grad_size_in_num \ + or grad.dtype != last_dtype: + grad_param_segments.append(([grad], [param])) + last_dtype = grad.dtype else: - segments[-1].append(var) + grad_param_segments[-1][0].append(grad) + grad_param_segments[-1][1].append(param) - fused_vars = [] - for idx, op in enumerate(block.ops): - if is_optimizer_op(op): - for segment in segments: - # insert coalesce tensor - tmp_var = block.create_var( - name=unique_name.generate('FusedOutput_{}'.format( - segment[0].name)), - dtype=segment[0].dtype, - persistable=True, - stop_gradient=True) - fused_vars.append(tmp_var) - block._insert_op_without_sync( - idx, - type="coalesce_tensor", - inputs={"Input": segment}, - outputs={"Output": segment, - "FusedOutput": tmp_var}, - attrs={ - "copy_data": True, - "use_align": True, - "dtype": segment[0].dtype, - OP_ROLE_KEY: OpRole.Backward - }) - break + if len(grad_param_segments) == 0: + return - # insert the allreduce_sum op - for idx, op in enumerate(block.ops): - if is_optimizer_op(op): - for fused_var in fused_vars: - block._insert_op_without_sync( - idx, - type='c_allreduce_sum', - inputs={'X': fused_var}, - outputs={'Out': fused_var}, - attrs={ - 'ring_id': ring_id, - 'use_calc_stream': self.calc_comm_same_stream, - OP_ROLE_KEY: OpRole.Backward - }) - if not self.calc_comm_same_stream: - block._insert_op_without_sync( - idx, - type='c_sync_calc_stream', - inputs={'X': fused_var}, - outputs={'Out': fused_var}, - attrs={OP_ROLE_KEY: OpRole.Backward}) - break + fused_vars = [None] * len(grad_param_segments) + for i in range(len(grad_param_segments) - 1, -1, -1): + # travers the grad_param_segments in backward + # not to use reversed since needs the absolute index value + grad_segment, param_segment = 
grad_param_segments[i] + # insert coalesce tensor + fused_var = block.create_var( + name=unique_name.generate('FusedOutput_{}'.format(grad_segment[ + 0].name)), + dtype=grad_segment[0].dtype, + persistable=False, + stop_gradient=True) + fused_vars[i] = fused_var + after_idx = outputs_name_to_idx[grad_segment[-1]][1] + block._insert_op_without_sync( + after_idx + 1, + type='c_allreduce_sum', + inputs={'X': fused_var}, + outputs={'Out': fused_var}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': self.calc_comm_same_stream, + OP_ROLE_KEY: OpRole.Backward + }) + if not self.calc_comm_same_stream: + block._insert_op_without_sync( + after_idx + 1, + type='c_sync_calc_stream', + inputs={'X': fused_var}, + outputs={'Out': fused_var}, + attrs={OP_ROLE_KEY: OpRole.Backward}) - if len(fused_vars) == 0: + # update the outputs_name_to_idx after insertion of sync/allreduce ops + outputs_name_to_idx = self.__get_ouputs_name_to_idx(first_backward_idx, + block) + # the before_idx is not guaranteed sorted, therefore we have to find the + # topology to insert the coalesce ops + pos_for_coalesce = {} + for i in range(len(grad_param_segments) - 1, -1, -1): + # We separate the insertion of coalesce op and the insertion of sync/allreduce op, + # since that the coalesce op's insertion may invalidate the outputs_name_to_idx + grad_segment, param_segment = grad_param_segments[i] + before_idx = len(block.ops) + for grad in outputs_name_to_idx: + before_idx = min(before_idx, outputs_name_to_idx[grad][0]) + pos_for_coalesce[i] = before_idx + + # insert the coalesce op based on the sorted before_idx + pos_for_coalesce = sorted( + pos_for_coalesce.items(), + key=lambda kv: (kv[1], kv[0]), + reverse=True) + for i, before_idx in pos_for_coalesce: + grad_segment, param_segment = grad_param_segments[i] + fused_var = fused_vars[i] + block._insert_op_without_sync( + before_idx, + type="coalesce_tensor", + inputs={"Input": param_segment}, + outputs={"Output": grad_segment, + "FusedOutput": fused_var}, + attrs={ + "copy_data": False, + "use_align": True, + "dtype": grad_segment[0].dtype, + OP_ROLE_KEY: OpRole.Backward + }) + + if self.calc_comm_same_stream: block._sync_with_cpp() return @@ -307,9 +341,31 @@ def _allreduce_fusion_program(self): block._insert_op_without_sync( idx, type='c_sync_comm_stream', - inputs={'X': fused_vars[0]}, - outputs={'Out': fused_vars[0]}, + inputs={'X': grad_segment[0]}, + outputs={'Out': grad_segment[0]}, attrs={'ring_id': ring_id, OP_ROLE_KEY: OpRole.Backward}) break block._sync_with_cpp() + + def __get_ouputs_name_to_idx(self, first_backward_idx, block): + # Each item of outputs_name_to_idx is a pair of idx. + # The first entry of this pair is the idx of the first op generates the grad, + # which is used to indicate the position to insert coalesce op. + # The second entry of this pair is the idx of the last op generates the grad, + # which is used to indicate the position to insert sync and allreduce op. 
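Reviewer note: the comment above describes the bookkeeping behind the fused-allreduce rewrite: for every gradient the pass records the index of the first op that writes it (where the coalesce_tensor op may go) and the index of the last op that writes it (after which c_allreduce_sum / c_sync_calc_stream are inserted). A minimal, framework-free sketch of that mapping, using plain dicts instead of Paddle blocks and ops (all names below are invented for illustration):

```python
def first_last_producer_index(ops):
    """Map each output name to (first_idx, last_idx) of the ops that produce it.

    `ops` is a list of dicts with an "outputs" key; it stands in for
    block.ops and op.output_arg_names in the real pass.
    """
    name_to_idx = {}
    for idx, op in enumerate(ops):
        for name in op["outputs"]:
            if name not in name_to_idx:
                # first and last producer coincide until the name shows up again
                name_to_idx[name] = (idx, idx)
            else:
                name_to_idx[name] = (name_to_idx[name][0], idx)
    return name_to_idx


ops = [
    {"outputs": ["x@GRAD"]},   # idx 0: first producer of x@GRAD
    {"outputs": ["y@GRAD"]},   # idx 1
    {"outputs": ["x@GRAD"]},   # idx 2: accumulated again -> last producer
]
mapping = first_last_producer_index(ops)
assert mapping["x@GRAD"] == (0, 2)  # coalesce before op 0, allreduce after op 2
assert mapping["y@GRAD"] == (1, 1)
```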
+ outputs_name_to_idx = {} + for idx in range(first_backward_idx, len(block.ops)): + op = block.ops[idx] + if is_optimizer_op(op): + break + for name in op.output_arg_names: + var = block.var(name) + if not outputs_name_to_idx.get(var): + # if the grad only be generated by one op + # the first idx and the last ids are identical + outputs_name_to_idx[var] = (idx, idx) + else: + outputs_name_to_idx[var] = (outputs_name_to_idx[var][0], + idx) + return outputs_name_to_idx diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index df775247c8c9e5..a5df9486da4656 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -84,27 +84,23 @@ def _enable_strategy(self, dist_strategy, context): dist_strategy.sharding = True dist_strategy.sharding_configs = {"segment_broadcast_MB": 32} - def minimize_impl(self, - loss, - startup_program=None, - parameter_list=None, - no_grad_set=None): - # TODO: (JZ-LIANG) support multiple comm in future - # self._nrings = self.user_defined_strategy.nccl_comm_num - self._nrings_sharding = 1 - self._nrings_dp = 1 + def _get_sharding_segment_strategy(self): + """ get + self._sharding_segment_strategy + 1. if by_size: self._broadcast_MB + 2. if by_anchors: self._sharding_segment_anchors + self._backward_remain_anchors + self._forward_remain_anchors + """ + strategy = self.user_defined_strategy + sharding_configs = strategy.sharding_configs + segment_strategy = str(sharding_configs["sharding_segment_strategy"]) - # segment - self._sharding_segment_strategy = str( - self.user_defined_strategy.sharding_configs[ - "sharding_segment_strategy"]) - if self._sharding_segment_strategy == "segment_broadcast_MB": - self._broadcast_MB = self.user_defined_strategy.sharding_configs[ - "segment_broadcast_MB"] + if segment_strategy == "segment_broadcast_MB": + self._broadcast_MB = sharding_configs["segment_broadcast_MB"] assert self._broadcast_MB > 0, "segment size should larger than zero !" - elif self._sharding_segment_strategy == "segment_anchors": - self._sharding_segment_anchors = self.user_defined_strategy.sharding_configs[ - "segment_anchors"] + elif segment_strategy == "segment_anchors": + self._sharding_segment_anchors = sharding_configs["segment_anchors"] assert len(self._sharding_segment_anchors ) > 0, "you should set the sharding segment anchors !" 
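For context on the `segment_broadcast_MB` strategy extracted above: sharding splits the program into segments and issues one broadcast per segment, drawing a segment boundary whenever the accumulated parameter volume crosses the configured threshold. A rough, self-contained sketch of that size-based grouping (the helper name, the sample parameters, and the 4-bytes-per-element assumption are illustrative, not Paddle's actual segmentation code):

```python
def split_by_broadcast_mb(param_sizes, broadcast_mb):
    """Group parameters into segments whose total size stays under broadcast_mb.

    param_sizes: list of (name, number_of_elements); fp32 elements assumed.
    Returns a list of segments, each a list of parameter names.
    """
    limit_bytes = broadcast_mb * 1024 * 1024
    segments, current, current_bytes = [], [], 0
    for name, numel in param_sizes:
        nbytes = numel * 4
        if current and current_bytes + nbytes > limit_bytes:
            segments.append(current)
            current, current_bytes = [], 0
        current.append(name)
        current_bytes += nbytes
    if current:
        segments.append(current)
    return segments


params = [("fc_0.w_0", 6_000_000), ("fc_0.b_0", 4_096), ("fc_1.w_0", 9_000_000)]
print(split_by_broadcast_mb(params, broadcast_mb=32))
# [['fc_0.w_0', 'fc_0.b_0'], ['fc_1.w_0']]
```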
self._backward_remain_anchors = self._sharding_segment_anchors[:] @@ -112,82 +108,104 @@ def minimize_impl(self, else: raise NotImplementedError( "the sharding segment strategy [{}] is not implemented".format( - str(self._sharding_segment_strategy))) + str(segment_strategy))) + self._sharding_segment_strategy = segment_strategy + + def _get_hybrid_degree(self): + """ get + self.hybrid_dp + self.sharding_degree + self.mp_degree + self.pp_degree + self.dp_degree + """ + strategy = self.user_defined_strategy + sharding_configs = strategy.sharding_configs # parallelism - self.sharding_degree = int(self.user_defined_strategy.sharding_configs[ - "sharding_degree"]) - assert self.sharding_degree > 0, "sharding degree must be larger than zero" - self.mp_degree = int(self.user_defined_strategy.sharding_configs[ - "mp_degree"]) + sharding_degree = int(sharding_configs["sharding_degree"]) + mp_degree = int(sharding_configs["mp_degree"]) + pp_degree = int(sharding_configs["pp_degree"]) + dp_degree = int(sharding_configs['dp_degree']) + global_world_size = self.role_maker._worker_num() + + assert sharding_degree > 0, "sharding degree must be larger than zero" # pipeline setting # TODO (JZ-LIANG) should revise here for support mix parallelism with pipeline - self.pp_degree = int(self.user_defined_strategy.sharding_configs[ - "pp_degree"]) - if self.pp_degree > 1: - assert self.user_defined_strategy.pipeline == True - - self.dp_degree = int(self.user_defined_strategy.sharding_configs[ - 'dp_degree']) - assert self.role_maker._worker_num( - ) == self.mp_degree * self.sharding_degree * self.pp_degree * self.dp_degree, "global work size [{}], mp_degree [{}], sharding_degree [{}], pp_degree [{}], dp_degree [{}].".format( - self.role_maker._worker_num(), - self.mp_degree, - self.sharding_degree, - self.pp_degree, - self.dp_degree, ) + if pp_degree > 1: + assert strategy.pipeline is True + + assert global_world_size == mp_degree * sharding_degree * pp_degree * dp_degree, \ + "global work size [{}], mp_degree [{}], sharding_degree [{}], pp_degree [{}], dp_degree [{}].".format( + global_world_size, mp_degree, sharding_degree, pp_degree, dp_degree) # FIXME (JZ-LIANG) deprecated hybrid_dp - if self.user_defined_strategy.sharding_configs["hybrid_dp"]: + if sharding_configs["hybrid_dp"]: logger.warning( - "[hybrid_dp] API setting is deprecated. Now when dp_degree >= 2, its will be in hybrid dp mode automatically" - ) - assert self.dp_degree >= 1 - if self.dp_degree > 1: - self.hybrid_dp = True - else: - self.hybrid_dp = False - - # NOTE (JZ-LIANG) - # there 2 kind of modes for gradient-merge and hybrid-dp in mixed parallism [sharding] and [pipeline]. - # we distinguish this two modes since the gm/hybrid-dp related allreduce should be insert in different place according different mode to have best performance: - # sharding: communication within node, and therefore should insert within backward segment to overlap with bw calc, conduct every micro step - # pipeline: communication accross nodes, and therefore should insert in update segemnt, conduct just once per global step - self.hybrid_dp_mode = None + "[hybrid_dp] API setting is deprecated. 
Now when " + "dp_degree >= 2, its will be in hybrid dp mode automatically") + assert dp_degree >= 1 + + self.hybrid_dp = True if dp_degree > 1 else False + self.sharding_degree = sharding_degree + self.mp_degree = mp_degree + self.pp_degree = pp_degree + self.dp_degree = dp_degree + + def _get_hybrid_dp_mode(self): + """ get + self.hybrid_dp_mode + self.gradient_merge_mode + self._gradient_merge_acc_step + self.pp_allreduce_in_optimize + """ + strategy = self.user_defined_strategy + sharding_configs = strategy.sharding_configs + + # NOTE (JZ-LIANG) + # There 2 kind of modes for gradient-merge and hybrid-dp in mixed parallelism [sharding] and [pipeline]. + # We distinguish this two modes since the gm/hybrid-dp related allreduce should be insert in different place + # according different mode to have best performance: + # sharding: communication within node, and therefore should insert within backward segment + # to overlap with bw calc, conduct every micro step. + # pipeline: communication across nodes, and therefore should insert in update segment, + # conduct just once per global step. + dp_mode = None # dp here is the pure dp as the outest parallelism if self.hybrid_dp: - assert self.dp_degree > 1, "hybrid dp is on, but dp degree is [{}]".format( - self.dp_degree) if self.pp_degree > 1: - self.hybrid_dp_mode = "pp_hybrid_dp" + dp_mode = "pp_hybrid_dp" else: - assert self.sharding_degree > 1, "by now we only support five kind of hybrid dp: sharding_hybrid_dp, mp_sharding_hybrid_dp, pp_hybrid_dp, mp_sharding_pp_hybrid_dp, sharding_pp_hybrid_dp." - self.hybrid_dp_mode = "sharding_hybrid_dp" + assert self.sharding_degree > 1, \ + "by now we only support five kind of hybrid dp: sharding_hybrid_dp, " \ + "mp_sharding_hybrid_dp, pp_hybrid_dp, mp_sharding_pp_hybrid_dp, sharding_pp_hybrid_dp." 
+ dp_mode = "sharding_hybrid_dp" # gradient merge - self._gradient_merge_acc_step = int( - self.user_defined_strategy.sharding_configs[ - "gradient_merge_acc_step"]) - self.gradient_merge_mode = None + gm_mode = None + gm_acc_step = int(sharding_configs["gradient_merge_acc_step"]) if self.pp_degree <= 1: - self.gradient_merge_mode = "sharding_gm" + gm_mode = "sharding_gm" self._grad2merged_grad = dict() else: - self.gradient_merge_mode = "pp_gm" - self._gradient_merge_acc_step = self.user_defined_strategy.pipeline_configs[ - 'accumulate_steps'] - if self._gradient_merge_acc_step > 1: + gm_mode = "pp_gm" + gm_acc_step = strategy.pipeline_configs['accumulate_steps'] + if gm_acc_step > 1: logger.info("Gradient merge in [{}], acc step = [{}]".format( - self.gradient_merge_mode, self._gradient_merge_acc_step)) + gm_mode, gm_acc_step)) - # optimize offload - self.optimize_offload = self.user_defined_strategy.sharding_configs[ - "optimize_offload"] + self.hybrid_dp_mode = dp_mode + self.gradient_merge_mode = gm_mode + self._gradient_merge_acc_step = gm_acc_step # this feature is design for ascend, and should NOT be used in GPU training - self.pp_allreduce_in_optimize = self.user_defined_strategy.sharding_configs[ + self.pp_allreduce_in_optimize = sharding_configs[ "pp_allreduce_in_optimize"] + def _inner_opt_minimize(self, loss, startup_program, parameter_list, + no_grad_set): + pipeline_configs = self.user_defined_strategy.pipeline_configs + if self.inner_opt is None: raise ValueError( "self.inner_opt of ShardingOptimizer should not be None.") @@ -195,32 +213,29 @@ def minimize_impl(self, if self.pp_degree > 1: pp_optimizer = fluid.optimizer.PipelineOptimizer( self.inner_opt, self._gradient_merge_acc_step) - - strategy = self.user_defined_strategy - self.schedule_mode = strategy.pipeline_configs['schedule_mode'] - self.pp_rank_ = self.role_maker._worker_index() // ( - self.sharding_degree * self.mp_degree) % self.pp_degree - - pipeline_opt = dict() - pipeline_opt['schedule_mode'] = self.schedule_mode - pipeline_opt['micro_batch_size'] = strategy.pipeline_configs[ - 'micro_batch_size'] - pipeline_opt['local_rank'] = self.pp_rank_ - pipeline_opt['global_rank'] = self.role_maker._worker_index() - pipeline_opt['use_sharding'] = True - # TODO (JZ-LIANG) should revise here for support mix parallelism with pipeline - pipeline_opt['ring_id'] = 20 - pipeline_opt['global_ring_id'] = 3 - pipeline_opt['mp_degree'] = self.mp_degree - pipeline_opt['mp_rank'] = self.role_maker._worker_index( - ) % self.mp_degree - + self._pp_optimizer = pp_optimizer + + global_rank = self.role_maker._worker_index() + schedule_mode = pipeline_configs['schedule_mode'] + + pipeline_opt = { + 'schedule_mode': schedule_mode, + 'micro_batch_size': pipeline_configs['micro_batch_size'], + 'local_rank': self.pp_rank, + 'global_rank': global_rank, + 'use_sharding': True, + # TODO (JZ-LIANG) should revise here for support mix parallelism with pipeline + 'ring_id': 20, + 'global_ring_id': 3, + 'mp_degree': self.mp_degree, + 'mp_rank': global_rank % self.mp_degree, + } main_program = loss.block.program main_program._pipeline_opt = pipeline_opt optimize_ops, params_grads, program_list, self.pipeline_pair, self.pp_ring_map = pp_optimizer.minimize( loss, startup_program, parameter_list, no_grad_set) - self.pp_degree = len(program_list) + assert self.pp_degree == len(program_list) else: optimize_ops, params_grads = self.inner_opt.minimize( loss, startup_program, parameter_list, no_grad_set) @@ -230,9 +245,8 @@ def minimize_impl(self, if 
self.pp_degree > 1: startup_program = startup_program._pipeline_opt['startup_program'] - #main_program = main_program._pipeline_opt['section_program']['program'] - print("pp_rank:", self.pp_rank_) - main_program = program_list[self.pp_rank_] + print("pp_rank:", self.pp_rank) + main_program = program_list[self.pp_rank] with open("main_%d" % self.role_maker._worker_index(), 'w') as f: f.writelines(str(main_program)) main_block = main_program.global_block() @@ -241,7 +255,6 @@ def minimize_impl(self, if main_block.has_var(param.name): new_params_grads.append((param, grad)) params_grads = new_params_grads - else: main_block = loss.block @@ -254,93 +267,106 @@ def minimize_impl(self, with open("main_%d" % self.role_maker._worker_index(), 'w') as f: f.writelines(str(main_program)) - # step0: _init_comm - self._init_comm() + return optimize_ops, params_grads - if self.sharding_degree > 1: + def _apply_sharding_pass(self, params_grads): + if self.sharding_degree == 1: return + + main_block = self._main_program.global_block() + startup_block = self._startup_program.global_block() - # step1: build shard - self._build_shard(params_grads) + # step1: build shard + self._build_shard(params_grads) - # step2: split_program - self._split_program(main_block) + # step2: split_program + self._split_program(main_block) - # step3: add broadcast and reduce ops - self._add_broadcast_allreduce(main_block) - main_block._sync_with_cpp() - startup_block._sync_with_cpp() + # step3: add broadcast and reduce ops + self._add_broadcast_allreduce(main_block) + main_block._sync_with_cpp() + startup_block._sync_with_cpp() - main_block._sync_with_cpp() + # step4: remove unneeded ops and vars from block + self._prune_main_program(main_block) + self._prune_startup_program(startup_block) - # step4: remove unneeded ops and vars from block - self._prune_main_program(main_block) - self._prune_startup_program(startup_block) + def _insert_allreduce_for_pp(self): + if self.pp_degree == 1: return - if self.pp_degree > 1: - # sharding-pp related logic - # pp_optimizer._rename_gradient_var_name(main_block) - # crop ops - if self.sharding_degree > 1: - for idx, op in reversed(list(enumerate(main_block.ops))): - if is_update_op(op): - op_role_var = op.attr('op_role_var') - param_name = op_role_var[0] - if not self._shard.has_param(param_name): - main_block._remove_op(idx) - - for idx, op in reversed(list(enumerate(main_block.ops))): - if op.type != 'cast': continue - in_name = op.input_arg_names[0] - if in_name not in self._params: continue - #if self._shard.has_param(param_name): continue - if in_name not in main_block.vars: + strategy = self.user_defined_strategy + main_block = self._main_program.global_block() + startup_block = self._startup_program.global_block() + + # sharding-pp related logic + # pp_optimizer._rename_gradient_var_name(main_block) + # crop ops + if self.sharding_degree > 1: + for idx, op in reversed(list(enumerate(main_block.ops))): + if is_update_op(op): + op_role_var = op.attr('op_role_var') + param_name = op_role_var[0] + if not self._shard.has_param(param_name): main_block._remove_op(idx) - accumulated_grad_names = pp_optimizer._accumulate_gradients( - main_block) - # accumulated_grad_names = sorted(accumulated_grad_names) - if self.pp_allreduce_in_optimize: - print("persistable FP32 grad: ") - print(accumulated_grad_names) - first_optimize_op_index = get_first_check_finite_and_unscale_op_idx( - main_block, raise_error=self.user_defined_strategy.amp) - insert_reduce_ops( + for idx, op in 
reversed(list(enumerate(main_block.ops))): + if op.type != 'cast': continue + in_name = op.input_arg_names[0] + if in_name not in self._params: continue + #if self._shard.has_param(param_name): continue + if in_name not in main_block.vars: + main_block._remove_op(idx) + + accumulated_grad_names = self._pp_optimizer._accumulate_gradients( + main_block) + # accumulated_grad_names = sorted(accumulated_grad_names) + if self.pp_allreduce_in_optimize: + print("persistable FP32 grad: ") + print(accumulated_grad_names) + first_optimize_op_index = get_first_check_finite_and_unscale_op_idx( + main_block, raise_error=strategy.amp) + insert_reduce_ops( + main_block, + first_optimize_op_index, + self.sharding_ring_id, + accumulated_grad_names, + self._shard, + core.op_proto_and_checker_maker.OpRole.Optimize, + use_calc_stream=True) + if self.hybrid_dp and self.hybrid_dp_mode == "pp_hybrid_dp": + first_optimize_op_index = get_first_check_finite_and_unscale_op_idx( + main_block, raise_error=strategy.amp) + if first_optimize_op_index >= 0: + insert_allreduce_ops( main_block, first_optimize_op_index, - self.sharding_ring_id, + self.dp_ring_id, accumulated_grad_names, - self._shard, core.op_proto_and_checker_maker.OpRole.Optimize, - use_calc_stream=True) - if self.hybrid_dp and self.hybrid_dp_mode == "pp_hybrid_dp": - first_optimize_op_index = get_first_check_finite_and_unscale_op_idx( - main_block, raise_error=self.user_defined_strategy.amp) - if first_optimize_op_index >= 0: - insert_allreduce_ops( - main_block, - first_optimize_op_index, - self.dp_ring_id, - accumulated_grad_names, - core.op_proto_and_checker_maker.OpRole.Optimize, - use_calc_stream=True, - user_defined_strategy=self.user_defined_strategy) + use_calc_stream=True, + user_defined_strategy=strategy) + def _adapt_amp_clip_without_sharding(self): + if self.sharding_degree > 1: return # if not use sharding, adapt amp/clip, for remain parallelism. # cast --> amp --> clip --> opt - if self.sharding_degree <= 1: - # FIXME(wangxi): mp should prune duplicated param_grads when calc - # amp inf_var & clip global_norm_var - # amp - FP16Utils.sync_amp_check_nan_inf( - main_block, [self.mp_ring_id, self.pp_ring_id]) + main_block = self._main_program.global_block() + startup_block = self._startup_program.global_block() + + # FIXME(wangxi): mp should prune duplicated param_grads when calc + # amp inf_var & clip global_norm_var - # clip - gradientclip_helper = GradientClipHelper(None) - gradientclip_helper.sync_global_norm( - main_block, [self.mp_ring_id, self.pp_ring_id]) + FP16Utils.sync_amp_check_nan_inf(main_block, + [self.mp_ring_id, self.pp_ring_id]) - # step6: loss div dp_degree + gradientclip_helper = GradientClipHelper(None) + gradientclip_helper.sync_global_norm( + main_block, [self.mp_ring_id, self.pp_ring_id]) + + def _insert_loss_grad_scale_op(self): + main_block = self._main_program.global_block() + + # step6: loss div dp_degree global_dp_degree = self.sharding_degree * self.dp_degree assert int(global_dp_degree) == global_dp_degree if global_dp_degree > 1: @@ -348,18 +374,67 @@ def minimize_impl(self, main_block._sync_with_cpp() - # TODO(wangxi): add optimize offload - # opt offload should be enable while gradient merge is enable && acc_step is quite large (e.g. >> 100) - # sync its memcpy could not be overlap with calc, otherwise it will slower down training severely. 
- if self.optimize_offload: + def _apply_optimize_offload_pass(self): + strategy = self.user_defined_strategy + sharding_configs = strategy.sharding_configs + main_block = self._main_program.global_block() + startup_block = self._startup_program.global_block() + + # optimize offload should be enable while gradient merge is enable and + # acc_step is quite large (e.g. >> 100). Since its memcpy could not be + # overlap with calc, otherwise it will slower down training severely. + if sharding_configs["optimize_offload"]: logger.info("Sharding with optimize offload !") offload_helper = OffloadHelper() offload_helper.offload(main_block, startup_block) offload_helper.offload_fp32param(main_block, startup_block) + def _dump_program_for_debug(self): + main_block = self._main_program.global_block() + startup_block = self._startup_program.global_block() + with open("start_sharding_%d" % self.role_maker._worker_index(), + 'w') as f: + f.writelines(str(startup_block.program)) + with open("main_sharding_%d" % self.role_maker._worker_index(), + 'w') as f: + f.writelines(str(main_block.program)) + + def minimize_impl(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None): + # TODO: (JZ-LIANG) support multiple comm in future + # self._nrings = self.user_defined_strategy.nccl_comm_num + self._nrings_sharding = 1 + self._nrings_dp = 1 + + self._get_sharding_segment_strategy() + self._get_hybrid_degree() + self._get_hybrid_dp_mode() + + # config sharding & dp groups + self._build_groups() + + # inner optimize minimize + optimize_ops, params_grads = self._inner_opt_minimize( + loss, startup_program, parameter_list, no_grad_set) + + self._init_comm() + + self._apply_sharding_pass(params_grads) + + self._insert_allreduce_for_pp() + + self._adapt_amp_clip_without_sharding() + + # loss div dp_degree + self._insert_loss_grad_scale_op() + + self._apply_optimize_offload_pass() + # step6: (optional) sharding gradient merge - if self.gradient_merge_mode == "sharding_gm" and self._gradient_merge_acc_step > 1: - self._sharding_gradient_merge(main_block) + self._sharding_gradient_merge() # # check op dependecy # FIXME (JZ-LIANG) enable checking in future. 
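The `global_dp_degree = sharding_degree * dp_degree` scaling above exists because both sharding and pure data parallelism sum gradients with allreduce, so the loss gradient must be divided by the number of replicas to recover an average. A tiny NumPy check of that identity (world size and gradient values are made up):

```python
import numpy as np

dp_degree, sharding_degree = 2, 2
global_dp_degree = dp_degree * sharding_degree

# per-replica gradients of the same parameter
grads = [np.array([0.4, 0.8]), np.array([0.2, 0.6]),
         np.array([0.6, 0.2]), np.array([0.8, 0.4])]

# allreduce(sum) followed by the 1/global_dp_degree scaling ...
scaled = np.sum(grads, axis=0) / global_dp_degree
# ... equals the average gradient single-replica training would use
assert np.allclose(scaled, np.mean(grads, axis=0))
```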
@@ -367,20 +442,15 @@ def minimize_impl(self, # check_allreduce_sum(main_block, self._shard, self.sharding_ring_id, # self.dp_ring_id) - if self.hybrid_dp: - # NOTE(JZ-LIANG) ensure in both sharding_hybrid_dp & pp_hybrid_dp - # init param broadcast should be called after startup pruning - self._initialization_broadcast(startup_block) + # NOTE(JZ-LIANG) ensure in both sharding_hybrid_dp & pp_hybrid_dp + # init param broadcast should be called after startup pruning + self._initialization_broadcast() - with open("start_sharding_%d" % self.role_maker._worker_index(), - 'w') as f: - f.writelines(str(startup_block.program)) - with open("main_sharding_%d" % self.role_maker._worker_index(), - 'w') as f: - f.writelines(str(main_block.program)) + self._dump_program_for_debug() - # GPU and NPU need to wait server ready - self._wait() + # GPU need to wait server ready, GPU and NPU is Layered connection + if not core.is_compiled_with_npu(): + self._wait() return optimize_ops, params_grads def _init_pair_comm(self, pair, ring_id): @@ -470,9 +540,6 @@ def _init_npu_pipeline_comm(self, startup_block): def _init_pipeline_comm(self, startup_block): # TODO (JZ-LIANG) to unify pp_rank_ and pp_rank - assert self.pp_rank_ == self.pp_rank, "pp rank for pp opt [{}], pp rank for sharding opt [{}]".format( - self.pp_rank_, self.pp_rank) - self._collective_helper._init_communicator( self._startup_program, self.current_endpoint, @@ -495,17 +562,8 @@ def _init_pipeline_comm(self, startup_block): self._init_pair_comm(pair, ring_id) def _init_comm(self): - - # config sharding & dp groups - self._build_groups() - # sync var startup_block = self._startup_program.global_block() - self.startup_prog_sync_var = startup_block.create_var( - name="startup_prog_sync_var", - shape=[1], - dtype=core.VarDesc.VarType.INT32, - persistable=False) # mp ring if self.mp_degree > 1: @@ -1050,7 +1108,8 @@ def _build_groups(self): sharding: 1 pure-dp: 2 global: 3 - pp: >= 20 + pp: 4 + pp-pair: >= 20 if one parallelism is not enable: -1 and only support parallelism hierarchy: mp --> sharding --> pp --> dp """ @@ -1215,11 +1274,16 @@ def _build_groups(self): return - def _initialization_broadcast(self, startup_block): + def _initialization_broadcast(self): """ this funtion is to ensure the initialization between dp group to be identical when hybrid-dp is used. 
""" + if not self.hybrid_dp: + return + + startup_block = self._startup_program.global_block() + params = [] for param in startup_block.iter_parameters(): params.append(param) @@ -1460,13 +1524,17 @@ def _true_apply_gradient(self): # lr_var = main_block.var("gradient_merge_current_step") # paddle.static.Print(lr_var, message="in OPTIMIZE last conditional") - def _sharding_gradient_merge(self, main_block): + def _sharding_gradient_merge(self): """ copy all optimize ops in origin main block remove all optimize ops in origin main block create cond block """ + if self.gradient_merge_mode != "sharding_gm" or self._gradient_merge_acc_step <= 1: + return + + main_block = self._main_program.global_block() # copy original optimize ops to temp ops desc list # remove them from block 0 tmp_copy_block = self._main_program._create_block() diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py index 1cec106caec82b..16ea7de2946bfd 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py +++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py @@ -64,18 +64,6 @@ def __init__(self, layers, hcg, strategy): logger.info("start broadcast dp parameters") broadcast_dp_parameters(self._layers, self._hcg) - def _set_tensor_trainable(self, tensor): - if tensor is None: - return - - if isinstance(tensor, tuple): - for t in tensor: - if is_float_tensor(t): - t.stop_gradient = False - else: - if is_float_tensor(tensor): - tensor.stop_gradient = False - def train_batch(self, data, optimizer, lr_scheduler=None, scaler=None): assert isinstance(optimizer, HybridParallelOptimizer), ( 'optimizer should be HybridParallelOptimizer subclass.') @@ -117,7 +105,6 @@ def train_batch(self, data, optimizer, lr_scheduler=None, scaler=None): for step_id in range(startup_steps): input_tensor = p2p.recv_forward() - self._set_tensor_trainable(input_tensor) output_tensor = self._forward_step(input_tensor) p2p.send_forward(output_tensor) @@ -131,7 +118,6 @@ def train_batch(self, data, optimizer, lr_scheduler=None, scaler=None): for i in range(steady_steps): last_iter = (i == (steady_steps - 1)) - self._set_tensor_trainable(input_tensor) output_tensor = self._forward_step(input_tensor) output_tensor_grad = p2p.send_forward_recv_backward(output_tensor) diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py index e533b2ef3f7a33..c508c88015cfda 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py @@ -15,6 +15,8 @@ import paddle from .utils import paddle_2_number, number_2_dtype from ...utils.log_util import logger +import numpy as np +from paddle import _C_ops _hcg = None @@ -40,6 +42,7 @@ def __init__(self): self.recv_shape_message = None self.recv_dtype_message = None + self.recv_stop_gradient = None self.has_send_meta = False self.has_recv_meta = False @@ -57,7 +60,11 @@ def _recv_shape_dtype(self, group): # recv dtype dtype = paddle.to_tensor([0]) paddle.distributed.recv(dtype, src=0, group=group) - return shape.numpy().tolist(), dtype.item() + + # recv stop_gradient + stop_grad = paddle.to_tensor([0]) + paddle.distributed.recv(stop_grad, src=0, group=group) + return shape.numpy().tolist(), dtype.item(), stop_grad.item() def recv_meta(self, group): tensor_type = paddle.to_tensor([0]) 
@@ -65,9 +72,10 @@ def recv_meta(self, group): tensor_type = tensor_type.item() if tensor_type == 0: - shape, dtype = self._recv_shape_dtype(group) + shape, dtype, stop_grad = self._recv_shape_dtype(group) self.recv_shape_message = shape self.recv_dtype_message = dtype + self.recv_stop_gradient = bool(stop_grad) elif tensor_type == 1: num = paddle.to_tensor([0]) @@ -75,13 +83,16 @@ def recv_meta(self, group): num = num.item() shapes = [] dtypes = [] + stop_grads = [] for i in range(num): - shape, dtype = self._recv_shape_dtype(group) + shape, dtype, stop_grad = self._recv_shape_dtype(group) shapes.append(shape) dtypes.append(dtype) + stop_grads.append(bool(stop_grad)) self.recv_shape_message = tuple(shapes) self.recv_dtype_message = tuple(dtypes) + self.recv_stop_gradient = tuple(stop_grads) def _send_dims_shape_dtype(self, tensor, group): # send len(shape) @@ -96,6 +107,10 @@ def _send_dims_shape_dtype(self, tensor, group): dtype = paddle.to_tensor(paddle_2_number(tensor.dtype)) paddle.distributed.send(dtype, dst=1, group=group) + # send trainable + stop_grad = paddle.to_tensor(int(tensor.stop_gradient)) + paddle.distributed.send(stop_grad, dst=1, group=group) + def send_meta(self, tensor, group): if isinstance(tensor, paddle.Tensor): tensor_type = paddle.to_tensor([0]) @@ -129,6 +144,12 @@ def set_send_message(self, tensor): _send_recv_meta = SendRecvMeta() +def _is_valid_send_recv_partial(tensor, mp_degree): + tensor_numel = np.prod(tensor.shape) + assert tensor_numel != 0, "can't send/recv zero element" + return mp_degree > 1 and tensor_numel % mp_degree == 0 + + def send_partial(tensor, dst=0, nranks=1, @@ -138,9 +159,17 @@ def send_partial(tensor, if group is not None and not group.is_member(): return ring_id = 0 if group is None else group.id - return paddle.fluid.core.ops.partial_send( - tensor, 'use_calc_stream', use_calc_stream, 'ring_id', ring_id, 'peer', - dst, 'num', nranks, 'id', rank_id) + + if _is_valid_send_recv_partial(tensor, nranks): + return _C_ops.partial_send(tensor.detach(), 'use_calc_stream', + use_calc_stream, 'ring_id', ring_id, 'peer', + dst, 'num', nranks, 'id', rank_id) + else: + return paddle.distributed.send( + tensor.detach(), + dst=dst, + group=group, + use_calc_stream=use_calc_stream) def recv_partial(tensor, @@ -153,10 +182,17 @@ def recv_partial(tensor, return ring_id = 0 if group is None else group.id - paddle.fluid.core.ops.partial_recv( - tensor, 'use_calc_stream', use_calc_stream, 'ring_id', ring_id, 'peer', - src, 'num', nranks, 'id', rank_id, 'dtype', tensor.dtype, 'out_shape', - tensor.shape) + if _is_valid_send_recv_partial(tensor, nranks): + _C_ops.partial_recv(tensor.detach(), 'use_calc_stream', use_calc_stream, + 'ring_id', ring_id, 'peer', src, 'num', nranks, + 'id', rank_id, 'dtype', tensor.dtype, 'out_shape', + tensor.shape) + else: + paddle.distributed.recv( + tensor.detach(), + src=src, + group=group, + use_calc_stream=use_calc_stream) def allgather_partial(tensor, @@ -164,15 +200,15 @@ def allgather_partial(tensor, rank_id=0, group=None, use_calc_stream=True): - if nranks == 1: + if not _is_valid_send_recv_partial(tensor, nranks): return tensor if group is not None and not group.is_member(): return ring_id = 0 if group is None else group.id - return paddle.fluid.core.ops.partial_allgather_( - tensor, 'use_calc_stream', use_calc_stream, 'ring_id', ring_id, - 'nranks', nranks, 'rank', rank_id) + return _C_ops.partial_allgather_(tensor.detach(), 'use_calc_stream', + use_calc_stream, 'ring_id', ring_id, + 'nranks', nranks, 'rank', rank_id) 
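The new `_is_valid_send_recv_partial` helper above gates the partial p2p ops: splitting a tensor across the model-parallel group only works when mp_degree > 1 and the element count divides evenly; otherwise the code falls back to a plain send/recv of the whole tensor. A standalone sketch of that decision (function names and shapes are illustrative):

```python
import numpy as np


def can_send_partial(shape, mp_degree):
    """Partial send/recv is only valid for a non-empty tensor whose numel splits evenly."""
    numel = int(np.prod(shape))
    assert numel != 0, "can't send/recv zero element"
    return mp_degree > 1 and numel % mp_degree == 0


def choose_p2p_path(shape, mp_degree):
    return "partial_send/partial_recv" if can_send_partial(shape, mp_degree) else "send/recv"


assert choose_p2p_path([16, 1024], mp_degree=4) == "partial_send/partial_recv"
assert choose_p2p_path([7, 13], mp_degree=4) == "send/recv"    # 91 % 4 != 0
assert choose_p2p_path([16, 1024], mp_degree=1) == "send/recv"  # no model parallelism
```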
def _p2p_helper(tensor_send_next, tensor_send_prev, recv_prev, recv_next): @@ -184,6 +220,8 @@ def _p2p_helper(tensor_send_next, tensor_send_prev, recv_prev, recv_next): # send / recv message recv_shape_msg = _send_recv_meta.recv_shape_message recv_dtype_msg = _send_recv_meta.recv_dtype_message + recv_stop_gradient = _send_recv_meta.recv_stop_gradient + send_shape_msg = _send_recv_meta.send_shape_message send_dtype_msg = _send_recv_meta.send_dtype_message @@ -196,13 +234,16 @@ def _p2p_helper(tensor_send_next, tensor_send_prev, recv_prev, recv_next): if isinstance(recv_shape_msg, tuple): tensor_recv_prev = [] for idx, shape in enumerate(recv_shape_msg): - tensor_recv_prev.append( - paddle.empty( - shape=shape, dtype=number_2_dtype(recv_dtype_msg[idx]))) + tmp = paddle.empty( + shape=shape, dtype=number_2_dtype(recv_dtype_msg[idx])) + tmp.stop_gradient = recv_stop_gradient[idx] + tensor_recv_prev.append(tmp) tensor_recv_prev = tuple(tensor_recv_prev) else: + tensor_recv_prev = paddle.empty( shape=recv_shape_msg, dtype=number_2_dtype(recv_dtype_msg)) + tensor_recv_prev.stop_gradient = recv_stop_gradient if recv_next: if isinstance(send_shape_msg, tuple): diff --git a/python/paddle/distributed/fleet/utils/recompute.py b/python/paddle/distributed/fleet/utils/recompute.py index 78503baf2fd5d2..89b14258c195ca 100755 --- a/python/paddle/distributed/fleet/utils/recompute.py +++ b/python/paddle/distributed/fleet/utils/recompute.py @@ -145,23 +145,25 @@ def backward(ctx, *args): # run backward() with only tensor that requires grad forward_outputs_with_grad = [] - backward_inputs = list(args) + # NOTE In Transformer-like network, if user put the attention mask into the recompute segment output, + # pylayer will force the stop_gradient of attention mask to be False, which will make the number of + # tensor that need grad does not match. + # the following backward_inputs_with_grad is used to avoid this case. + backward_inputs_with_grad = [] for i in range(len(outputs)): if isinstance(outputs[i], core.VarBase) and not outputs[i].stop_gradient: forward_outputs_with_grad.append(outputs[i]) + backward_inputs_with_grad.append(args[i]) + if len(forward_outputs_with_grad) == 0: raise RuntimeError( "none of output has requires_grad=True, this recompute() is not necessary" ) - assert len(backward_inputs) == len( - forward_outputs_with_grad - ), "number of forward outputs is [{}], but the backward got [{}] inputs".format( - len(forward_outputs_with_grad), len(backward_inputs)) - # actually backward - paddle.autograd.backward(forward_outputs_with_grad, backward_inputs) + paddle.autograd.backward(forward_outputs_with_grad, + backward_inputs_with_grad) grads = list(inp._grad_ivar() for inp in detached_inputs if isinstance(inp, core.VarBase)) diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 5c2f305c8dca0c..8bf27f6d2fd988 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -16,6 +16,7 @@ from .proto import framework_pb2 from paddle.fluid import framework as framework +from paddle.fluid import program_guard from . 
import core import collections import copy @@ -944,6 +945,13 @@ def _append_backward_ops_with_checkpoints_( for op_desc in reversed(added_descs): grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op_desc, cpt.to_text(no_grad_dict[block.idx]), []) + + # Set device for grad_op according to forward Op + if op_desc.has_attr(device_attr_name): + op_device = op_desc.attr(device_attr_name) + for g_op_desc in grad_op_desc: + g_op_desc._set_attr(device_attr_name, op_device) + for key in var_name_dict: _rename_arg_(grad_op_desc, key, var_name_dict[key]) grad_op_descs.extend(grad_op_desc) @@ -2015,3 +2023,72 @@ def gradients(targets, inputs, target_gradients=None, no_grad_set=None): outs = calc_gradient(targets, inputs, target_gradients, no_grad_set) return _as_list(outs) + + +@framework.static_only +def gradients_with_optimizer(program, optimizer, inputs=None, outputs=None): + """ + :api_attr: Static Graph + + Backpropagate the gradients of the program and apply the gradients with the given optimizer. + + Args: + program (Program): The input program. + optimizer (Optimizer): The optimizer to apply the gradients. + inputs (Tensor|list[Tensor]|tuple[Tensor], optional): The input Tensors. + If None, the inputs will be created from the input variables in the given program. Default:None. + outputs (Tensor|list[Tensor]|tuple[Tensor], optional): The output Tensors. + If None, the outputs will be created from the output variables in the given program. Default: None. + + Return: + tuple: tuple (optimize_ops, params_grads), A list of operators appended + by gradients_with_optimizer and a list of (param, grad) variable pairs, param is + ``Parameter``, grad is the gradient value corresponding to the parameter. + The returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to + indicate program pruning. If so, the program will be pruned by ``feed`` and + ``fetch_list`` before run, see details in ``Executor``. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + img = static.data(name='image', shape=[None, 784]) + pred = static.nn.fc(x=img, size=10, activation='relu') + loss = paddle.mean(pred) + opt_ops, pram_grads = paddle.fluid.backward.gradients_with_optimizer(static.default_main_program(), opt) + print(opt_ops) + + """ + check_type(program, 'program', paddle.fluid.Program, + 'paddle.static.gradients_with_optimizer') + check_type(optimizer, 'optimizer', paddle.optimizer.Optimizer, + 'paddle.static.gradients_with_optimizer') + + if inputs is None or outputs is None: + in_set = set() + out_set = set() + for block in program.blocks: + for op in block.ops: + for name in op.input_arg_names: + in_set.add(block.vars[name]) + for name in op.output_arg_names: + out_set.add(block.vars[name]) + if inputs is None: + inputs = list(in_set.difference(out_set)) + if outputs is None: + outputs = list(out_set.difference(in_set)) + + grads = gradients(outputs, inputs) + + with program_guard(program, None): + pram_grads = [(pram, grad) for pram, grad in zip(inputs, grads) + if isinstance(pram, paddle.fluid.framework.Parameter) and + grad is not None] + + optimize_ops = optimizer.apply_gradients(pram_grads) + + return optimize_ops, pram_grads diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 8fd01509331e20..04fb45cd3ae22d 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -19,11 +19,15 @@ import warnings import functools +import paddle from . import layers from . import framework from . 
import core from . import name_scope from .dygraph import base as imperative_base +from .data_feeder import check_variable_and_dtype +from .framework import in_dygraph_mode +from .layer_helper import LayerHelper __all__ = [ 'set_gradient_clip', 'ErrorClipByValue', 'ClipGradByValue', @@ -31,6 +35,30 @@ ] +def _squared_l2_norm(x): + r""" + This OP returns the squared L2 norm of a tensor. + """ + + if core.is_compiled_with_npu() or core.is_compiled_with_xpu(): + square = layers.square(x) + sum_square = layers.reduce_sum(square) + return sum_square + + if in_dygraph_mode(): + return core.ops.squared_l2_norm(x) + + op_type = 'squared_l2_norm' + check_variable_and_dtype(x, 'x', ['float32'], op_type) + helper = LayerHelper(op_type, **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + + inputs = {"X": x} + outputs = {'Out': out} + helper.append_op(type=op_type, inputs=inputs, outputs=outputs) + return out + + class BaseErrorClipAttr(object): def __str__(self): raise NotImplementedError() @@ -258,18 +286,18 @@ class ClipGradByNorm(ClipGradBase): .. math:: Out = - \\left \{ - \\begin{aligned} - & X & & if (norm(X) \\leq clip\_norm) \\\\ - & \\frac{clip\_norm*X}{norm(X)} & & if (norm(X) > clip\_norm) \\\\ - \\end{aligned} - \\right. + \left\{ + \begin{array}{ccl} + X & & if (norm(X) \leq clip\_norm) \\ + \frac{clip\_norm*X}{norm(X)} & & if (norm(X) > clip\_norm) \\ + \end{array} + \right. where :math:`norm(X)` represents the L2 norm of :math:`X`. .. math:: - norm(X) = ( \\sum_{i=1}^{n}|x\_i|^2)^{ \\frac{1}{2}} + norm(X) = ( \sum_{i=1}^{n}|x\_i|^2)^{ \frac{1}{2}} Note: ``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0. @@ -361,7 +389,7 @@ class ClipGradByGlobalNorm(ClipGradBase): .. math:: - t\_list[i] = t\_list[i] * \\frac{clip\_norm}{\max(global\_norm, clip\_norm)} + t\_list[i] = t\_list[i] * \frac{clip\_norm}{\max(global\_norm, clip\_norm)} where: @@ -416,8 +444,8 @@ def _dygraph_clip(self, params_grads): if g.type == core.VarDesc.VarType.SELECTED_ROWS: merge_grad = layers.merge_selected_rows(g) merge_grad = layers.get_tensor_from_selected_rows(merge_grad) - square = layers.square(merge_grad) - sum_square = layers.reduce_sum(square) + + sum_square = _squared_l2_norm(merge_grad) sum_square_list.append(sum_square) # all parameters have been filterd out @@ -439,6 +467,7 @@ def _dygraph_clip(self, params_grads): if getattr(p, 'need_clip', True) is False: params_and_grads.append((p, g)) continue + # TODO(wangxi): use inplace elementwise_mul new_grad = layers.elementwise_mul(x=g, y=clip_var) params_and_grads.append((p, new_grad)) @@ -460,8 +489,7 @@ def _static_clip(self, params_grads): merge_grad = layers.get_tensor_from_selected_rows( merge_grad) - square = layers.square(merge_grad) - sum_square = layers.reduce_sum(input=square) + sum_square = _squared_l2_norm(merge_grad) sum_square_list.append(sum_square) # all parameters have been filterd out @@ -489,9 +517,14 @@ def _static_clip(self, params_grads): continue with p.block.program._optimized_guard([p, g]): - new_grad = layers.elementwise_mul(x=g, y=scale_var) - param_new_grad_name_dict[p.name] = new_grad.name - params_and_grads.append((p, new_grad)) + # inplace + p.block.append_op( + type='elementwise_mul', + inputs={'X': g, + 'Y': scale_var}, + outputs={'Out': g}) + param_new_grad_name_dict[p.name] = g.name + params_and_grads.append((p, g)) _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict) return params_and_grads @@ -513,8 +546,7 @@ def _process_context(self, context, param, grad): merge_grad = 
layers.merge_selected_rows(grad) merge_grad = layers.get_tensor_from_selected_rows(merge_grad) - square = layers.square(merge_grad) - local_norm_var = layers.reduce_sum(input=square) + local_norm_var = _squared_l2_norm(merge_grad) context[self.group_name].append(local_norm_var) self.context = context @@ -532,10 +564,14 @@ def _create_operators(self, param, grad): assert group_scale_var.shape == (1, ) self.context[group_scale_name] = group_scale_var - new_grad = layers.elementwise_mul( - x=grad, y=self.context[group_scale_name]) + # inplace + param.block.append_op( + type='elementwise_mul', + inputs={'X': grad, + 'Y': self.context[group_scale_name]}, + outputs={'Out': grad}) - return param, new_grad + return param, grad @framework.dygraph_not_support @@ -709,7 +745,7 @@ def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict): continue block_id_list.append(block_id) for op in param.block.program.global_block().ops: - if 'op_namescope' in op.all_attrs() and "gradient_clip" in op.attr( + if op.has_attr("op_namescope") and "gradient_clip" in op.attr( "op_namescope") and op.attr('op_role_var'): param_name = op.attr('op_role_var')[0] if param_name in param_new_grad_name_dict: diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py index 37fe1e505f02d9..703146736e3c18 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py @@ -150,6 +150,8 @@ def _update_list(self): 'c_identity', 'c_concat', 'c_allreduce_sum', + 'concat', + 'split', } # The set of ops that don't support fp16 calculation diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py index 16dfb2bd50c141..5978d3829aecae 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py @@ -110,6 +110,27 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): cast_name = in_var.name + '.cast_' + _dtype_to_str(dest_dtype) out_var = block.vars.get(cast_name) if out_var is None or out_var.dtype != dest_dtype: + op_device = op.attr('op_device') + # NOTE(wangxi): optimize for pipeline, reduce one send. + # if in_var is stop_gradient and prev_op device is `all`, + # set cast_op device to `all`, can reduce send cast_var. + # TODO: need remove this after we unified the dynamic + # and static pipeline interface. 
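The fp16_utils NOTE just above motivates a small pipeline optimization: when the fp32 input of an inserted cast is stop_gradient and its producer runs on every stage (its op_device contains "all"), the cast op inherits that device string so each stage casts locally instead of receiving the cast result. A reduced sketch of that device decision (the device strings and argument names are example values, not Paddle's exact attribute format):

```python
def pick_cast_device(consumer_device, is_fp32_input, stop_gradient, prev_op_device):
    """Return the op_device to put on an inserted cast op.

    Default to the consumer's placement; only hoist the cast onto every
    pipeline stage when the input is a stop_gradient fp32 var whose producer
    already runs on all stages.
    """
    if is_fp32_input and stop_gradient and prev_op_device and "all" in prev_op_device:
        return prev_op_device
    return consumer_device


# the attention-mask case: produced on all stages, consumed in fp16 on stage 1
assert pick_cast_device("gpu:1", True, True, "gpu:all") == "gpu:all"
# an ordinary activation keeps the consumer's placement
assert pick_cast_device("gpu:1", True, False, "gpu:0") == "gpu:1"
```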
+ if src_dtype == core.VarDesc.VarType.FP32 and in_var.stop_gradient: + prev_op = None + if in_var.op is op: + prev_op = find_true_prev_op(block.ops, op, + in_var_name) + elif in_var.op is not None: + prev_op = in_var.op + + prev_op_device = None + if prev_op is not None: + prev_op_device = prev_op.attr('op_device') + + if prev_op_device is not None and 'all' in prev_op_device: + op_device = prev_op_device + out_var = block.create_var( name=cast_name, dtype=dest_dtype, @@ -124,7 +145,7 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): attrs={ "in_dtype": in_var.dtype, "out_dtype": out_var.dtype, - "op_device": op.attr("op_device") + "op_device": op_device }) num_cast_ops += 1 _rename_arg(op, in_var.name, out_var.name) diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index 5996e752c8c22d..06f3f5f3afa750 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -578,6 +578,7 @@ def _sample_mse(self): var_tensor = _load_variable_data(self._scope, var_name) var_tensor = var_tensor.flatten() abs_max_value = float(np.max(np.abs(var_tensor))) + abs_max_value = 1e-8 if abs_max_value == 0.0 else abs_max_value s = 0.3 if var_name not in self._best_mse_loss: self._best_mse_loss[var_name] = float('inf') diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index b3b12a477e2a0a..9917730daa543f 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -1312,6 +1312,7 @@ def _insert_post_dequant_op(self, graph, op_node): assert self._is_float( scale_v), 'The scale of parameter %s is not a float.' % ( original_var_name) + scale_v = 1e-8 if scale_v == 0.0 else scale_v max_range *= param_range / scale_v else: max_range *= act_range @@ -1413,6 +1414,7 @@ def _clip(x, scale): x[:, i] = _clip(x[:, i], s) x[:, i] = np.round(x[:, i] / s * bnt) else: + scale = 1e-8 if scale == 0.0 else scale x = _clip(x, scale) x = np.round(x / scale * bnt) return x diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py index bd464450aef7f4..a7eb0d31b7f858 100644 --- a/python/paddle/fluid/dygraph/amp/auto_cast.py +++ b/python/paddle/fluid/dygraph/amp/auto_cast.py @@ -90,6 +90,17 @@ def _update_list(custom_white_list, custom_black_list): return _white_list, _black_list +def _in_amp_guard(): + """ + Judge whether current code block is in `amp_guard` context. 
+ """ + tracer = _dygraph_tracer() + if tracer: + return tracer._enable_autocast + else: + return False + + @signature_safe_contextmanager @dygraph_only def amp_guard(enable=True, custom_white_list=None, custom_black_list=None): diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py index fe70fd1094f581..e2fcf4f2c2712e 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py @@ -14,7 +14,7 @@ from __future__ import print_function -import gast +from paddle.utils import gast from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py index 29eee429ef66ab..74f946acedb27f 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py @@ -18,7 +18,7 @@ # It provides a compatibility layer between the AST of various Python versions, # as produced by ast.parse from the standard ast module. # See details in https://github.com/serge-sans-paille/gast/ -import gast +from paddle.utils import gast from paddle.fluid.dygraph.dygraph_to_static.assert_transformer import AssertTransformer from paddle.fluid.dygraph.dygraph_to_static.basic_api_transformer import BasicApiTransformer from paddle.fluid.dygraph.dygraph_to_static.break_continue_transformer import BreakContinueTransformer diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/basic_api_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/basic_api_transformer.py index 5ea1fdfac0928a..acf2c3ec09b5d5 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/basic_api_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/basic_api_transformer.py @@ -13,7 +13,7 @@ # limitations under the License. import astor -import gast +from paddle.utils import gast from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper from paddle.fluid.dygraph.dygraph_to_static import utils diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py index cb0383b9f73623..401ad1c8e84e45 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py @@ -14,7 +14,7 @@ from __future__ import print_function -import gast +from paddle.utils import gast from paddle.fluid import unique_name from paddle.fluid.dygraph.dygraph_to_static.utils import index_in_list diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py index c2481d16825ec8..3e606139245d60 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py @@ -13,7 +13,7 @@ # limitations under the License. 
from __future__ import print_function -import gast +from paddle.utils import gast from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py index 1171b5dbdfa22a..ef2d062d2d0187 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py @@ -13,7 +13,7 @@ # limitations under the License. from __future__ import print_function -import gast +from paddle.utils import gast from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/grad_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/grad_transformer.py index 272d480c5b7a20..98045b3aae4322 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/grad_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/grad_transformer.py @@ -14,7 +14,7 @@ from __future__ import print_function -import gast +from paddle.utils import gast import warnings from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py index 5bc1c3d96d9c95..8fc5a691d212c2 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py @@ -22,7 +22,7 @@ # It provides a compatibility layer between the AST of various Python versions, # as produced by ast.parse from the standard ast module. 
# See details in https://github.com/serge-sans-paille/gast/ -import gast +from paddle.utils import gast from paddle.fluid import unique_name from paddle.fluid.dygraph.dygraph_to_static.utils import create_funcDef_node, ast_to_source_code diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py index a3311765a996f6..e041fe7c9ac37c 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py @@ -15,7 +15,7 @@ from __future__ import print_function import astor -import gast +from paddle.utils import gast from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper, StaticAnalysisVisitor from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/logical_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/logical_transformer.py index 8470e895dd3c89..e5c093f9a9255c 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/logical_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/logical_transformer.py @@ -14,7 +14,7 @@ from __future__ import print_function -import gast +from paddle.utils import gast from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code cmpop_type_to_str = { diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py index 14bb54983b524a..9859feb9d90792 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py @@ -15,7 +15,7 @@ from __future__ import print_function import copy -import gast +from paddle.utils import gast from collections import defaultdict from paddle.fluid import unique_name diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py index b2f4060b106828..0670c048c5e26b 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py @@ -17,7 +17,7 @@ import collections import inspect -import gast +from paddle.utils import gast from paddle.fluid import core from paddle.fluid.dygraph.dygraph_to_static.utils import unwrap from paddle.fluid.framework import Program diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py index a99a5d50813719..e275ee04858f9a 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py @@ -17,7 +17,7 @@ import six import paddle -from paddle.fluid import framework, backward, core +from paddle.fluid import framework, backward, core, program_guard from paddle.fluid.dygraph import layers from paddle.fluid.dygraph.base import switch_to_static_graph from paddle.fluid.dygraph.dygraph_to_static import logging_utils @@ -26,6 +26,9 @@ from paddle.fluid.layers.utils import pack_sequence_as from paddle.fluid.layers.utils import _hash_with_id from paddle.fluid.compiler import BuildStrategy +from paddle.fluid.contrib.mixed_precision.decorator import AutoMixedPrecisionLists +from paddle.fluid.contrib.mixed_precision.fp16_utils import rewrite_program +from paddle.fluid.dygraph.amp.auto_cast import _in_amp_guard 
import paddle.compat as cpt from paddle import _C_ops @@ -149,6 +152,9 @@ def __init__(self, main_program, inputs, outputs, parameters=None, self._double_grads = self._get_double_grads(self._origin_main_program) self.training = True + # For AMP training + self._amp_list = AutoMixedPrecisionLists() + @LazyInitialized def _infer_program(self): """ @@ -168,6 +174,25 @@ def _train_program(self): return train_program + @LazyInitialized + @switch_to_static_graph + def _infer_amp_program(self): + """ + Lazy initialized property of infer_amp_program. + """ + infer_amp_program = self._origin_main_program.clone() + with program_guard(infer_amp_program): + rewrite_program(infer_amp_program, self._amp_list) + + return infer_amp_program + + @LazyInitialized + def _train_amp_program(self): + """ + Lazy initialized property of train_amp_program. + """ + return self._append_backward_desc(self._infer_amp_program) + @LazyInitialized def _infer_program_id(self): return _hash_with_id(self._infer_program, self) @@ -180,6 +205,14 @@ def _train_program_id(self): return program_id + @LazyInitialized + def _train_amp_program_id(self): + program_id = _hash_with_id(self._train_amp_program, self) + core._set_cached_executor_build_strategy(program_id, + self._build_strategy) + + return program_id + def _verify_program(self, main_program): """ Verify that the program parameter is initialized, prune some unused params, @@ -241,12 +274,17 @@ def _get_double_grads(self, program): double_grads.append(var_base) return self._valid_vars(double_grads) + def _get_end_op_index(self): + infer_program = self._infer_amp_program if _in_amp_guard( + ) else self._infer_program + return infer_program.desc.block(0).op_size() + def __call__(self, inputs): in_vars, out_vars = self._prepare(inputs) attrs = ('global_block', self.program.desc.block(0), 'start_op_index', - 0, 'end_op_index', self._infer_program.desc.block(0).op_size(), - 'is_test', not self.training, 'program_id', self.program_id) + 0, 'end_op_index', self._get_end_op_index(), 'is_test', + not self.training, 'program_id', self.program_id) _C_ops.run_program( self._valid_vars(in_vars), self._valid_vars(self._params), @@ -258,11 +296,19 @@ def __call__(self, inputs): @property def program(self): - return self._train_program if self.training else self._infer_program + if self.training: + return self._train_amp_program if _in_amp_guard( + ) else self._train_program + else: + return self._infer_program @property def program_id(self): - return self._train_program_id if self.training else self._infer_program_id + if self.training: + return self._train_amp_program_id if _in_amp_guard( + ) else self._train_program_id + else: + return self._infer_program_id def _prepare(self, inputs): """ diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py index 9d1ec35764b090..7960617369e3f2 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py @@ -14,7 +14,7 @@ from __future__ import print_function -import gast +from paddle.utils import gast from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper, StaticAnalysisVisitor diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index 3664c4b0016449..58aac8e266fedd 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py 
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -15,7 +15,7 @@ from __future__ import print_function import collections -import gast +from paddle.utils import gast import inspect import six import textwrap diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py index 4bcd49dc8e1577..0c7a8bf421a128 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py @@ -14,7 +14,7 @@ from __future__ import print_function -import gast +from paddle.utils import gast from paddle.fluid import unique_name from paddle.fluid.dygraph.dygraph_to_static.utils import index_in_list diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py b/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py index cbe6b8a0ff9428..ce5f50137b7aa9 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py @@ -14,7 +14,7 @@ from __future__ import print_function -import gast +from paddle.utils import gast from .utils import is_paddle_api, is_dygraph_api, is_numpy_api, index_in_list __all__ = ['AstNodeWrapper', 'NodeVarType', 'StaticAnalysisVisitor'] diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py index eb53d7ec9bec89..0bc167132e3ed7 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py @@ -15,7 +15,7 @@ from __future__ import print_function import copy -import gast +from paddle.utils import gast from paddle.fluid import unique_name from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py index 351a9dcfa3aa2a..650857eefb3bb1 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py @@ -19,7 +19,7 @@ import atexit import copy import collections -import gast +from paddle.utils import gast import inspect import os import six diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py index c7844f160cee5a..b118eeadf7e7e5 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py @@ -15,7 +15,7 @@ from __future__ import print_function import six -import gast +from paddle.utils import gast from paddle.fluid import core from paddle.fluid import unique_name diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index de722e6e16c894..608e85acec3f27 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -1151,9 +1151,6 @@ def forward(self, input): class BatchNorm(layers.Layer): r""" - :alias_main: paddle.nn.BatchNorm - :alias: paddle.nn.BatchNorm,paddle.nn.layer.BatchNorm,paddle.nn.layer.norm.BatchNorm - :old_api: paddle.fluid.dygraph.BatchNorm This interface is used to construct a callable object of the ``BatchNorm`` class. For more details, refer to code examples. 
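The hunks that follow only re-escape the LaTeX in this docstring; the math itself is ordinary mini-batch normalization. As a quick numeric check of those formulas (an illustrative sketch, not part of the patch):

import numpy as np

x = np.array([1., 2., 3., 4.], dtype='float32')  # a toy mini-batch
mu = x.mean()                                    # mini-batch mean
var = x.var()                                    # mini-batch variance
x_hat = (x - mu) / np.sqrt(var + 1e-5)           # normalize
y = 1.0 * x_hat + 0.0                            # scale (gamma=1) and shift (beta=0)
print(y.mean(), y.var())                         # approximately 0 and 1

With gamma = 1 and beta = 0 the normalized output has roughly zero mean and unit variance, which is exactly what the docstring equations below state.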
@@ -1164,16 +1161,16 @@ class BatchNorm(layers.Layer): Internal Covariate Shift `_ for more details. - When use_global_stats = False, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch. + When use_global_stats = False, the :math:`\mu_{\beta}` + and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch. Calculated as follows: .. math:: - \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ - \ mini-batch\ mean \\\\ - \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ - \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad & + //\ mini-batch\ mean \\ + \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \mu_{\beta})^2 \qquad & + //\ mini-batch\ variance \\ - :math:`x` : mini-batch data - :math:`m` : the size of the mini-batch data @@ -1191,13 +1188,14 @@ class BatchNorm(layers.Layer): .. math:: - \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ - \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ - y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\ + \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ + y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift + - - :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero - - :math:`\\gamma` : trainable proportional parameter - - :math:`\\beta` : trainable deviation parameter + - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero + - :math:`\gamma` : trainable proportional parameter + - :math:`\beta` : trainable deviation parameter Parameters: num_channels(int): Indicate the number of channels of the input ``Tensor``. @@ -3011,9 +3009,9 @@ class SpectralNorm(layers.Layer): .. math:: - \mathbf{v} := \\frac{\mathbf{W}^{T} \mathbf{u}}{\|\mathbf{W}^{T} \mathbf{u}\|_2} + \mathbf{v} := \frac{\mathbf{W}^{T} \mathbf{u}}{\|\mathbf{W}^{T} \mathbf{u}\|_2} - \mathbf{u} := \\frac{\mathbf{W}^{T} \mathbf{v}}{\|\mathbf{W}^{T} \mathbf{v}\|_2} + \mathbf{u} := \frac{\mathbf{W}^{T} \mathbf{v}}{\|\mathbf{W}^{T} \mathbf{v}\|_2} Step 3: Calculate :math:`\sigma(\mathbf{W})` and normalize weight values. @@ -3022,7 +3020,7 @@ class SpectralNorm(layers.Layer): \sigma(\mathbf{W}) = \mathbf{u}^{T} \mathbf{W} \mathbf{v} - \mathbf{W} = \\frac{\mathbf{W}}{\sigma(\mathbf{W})} + \mathbf{W} = \frac{\mathbf{W}}{\sigma(\mathbf{W})} Refer to `Spectral Normalization `_ . diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 2247d49483035c..02f9fd1a95e2b2 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -2035,6 +2035,11 @@ def __init__(self, del op_attrs[role_var_name] if len(self.desc.type()) != 0: + # NOTE(Aurelius84): prog.clone() will lead that var.op is always None, + # we add this to fix the problem. 
+ for arg in self.desc.output_arg_names(): + if block.has_var(arg) and block.var(arg).op is None: + block.var(arg).op = self return if type is None: raise ValueError( diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index cebb5e77ac636f..dc1e56f13f3b1d 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -4890,6 +4890,7 @@ def split(input, num_or_sections, dim=-1, name=None): if isinstance(dim, Variable): dim = dim.numpy() dim = dim.item(0) + assert len(input.shape) + dim >= 0, "(rank(x) + axis) must >= 0" dim = (len(input.shape) + dim) if dim < 0 else dim attrs += ('axis', dim) @@ -4951,6 +4952,7 @@ def _get_SectionsTensorList(one_list): dim.stop_gradient = True inputs['AxisTensor'] = dim else: + assert len(input.shape) + dim >= 0, "(rank(x) + axis) must >= 0" dim = (len(input_shape) + dim) if dim < 0 else dim attrs['axis'] = dim @@ -7097,9 +7099,9 @@ def dice_loss(input, label, epsilon=0.00001, name=None): .. math:: - dice\_loss &= 1 - \\frac{2 * intersection\_area}{total\_area} \\\\ - &= \\frac{(total\_area - intersection\_area) - intersection\_area}{total\_area} \\\\ - &= \\frac{(union\_area - intersection\_area)}{total\_area} + dice\_loss &= 1 - \frac{2 * intersection\_area}{total\_area} \\ + &= \frac{(total\_area - intersection\_area) - intersection\_area}{total\_area} \\ + &= \frac{(union\_area - intersection\_area)}{total\_area} Parameters: @@ -13065,8 +13067,8 @@ def log_loss(input, label, epsilon=1e-4, name=None): .. math:: - Out = -label * \\log{(input + \\epsilon)} - - (1 - label) * \\log{(1 - input + \\epsilon)} + Out = -label * \log{(input + \epsilon)} + - (1 - label) * \log{(1 - input + \epsilon)} Args: input (Tensor|list): A 2-D tensor with shape [N x 1], where N is the @@ -14500,17 +14502,17 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): .. 
math:: - dkernel[0] &= dilations[0] \\times (kernel\_sizes[0] - 1) + 1 + dkernel[0] &= dilations[0] \times (kernel\_sizes[0] - 1) + 1 - dkernel[1] &= dilations[1] \\times (kernel\_sizes[1] - 1) + 1 + dkernel[1] &= dilations[1] \times (kernel\_sizes[1] - 1) + 1 - hout &= \\frac{H + paddings[0] + paddings[2] - dkernel[0]}{strides[0]} + 1 + hout &= \frac{H + paddings[0] + paddings[2] - dkernel[0]}{strides[0]} + 1 - wout &= \\frac{W + paddings[1] + paddings[3] - dkernel[1]}{strides[1]} + 1 + wout &= \frac{W + paddings[1] + paddings[3] - dkernel[1]}{strides[1]} + 1 - Cout &= C \\times kernel\_sizes[0] \\times kernel\_sizes[1] + Cout &= C \times kernel\_sizes[0] \times kernel\_sizes[1] - Lout &= hout \\times wout + Lout &= hout \times wout Parameters: diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index ef168d2d921751..ab3dbad1ef326d 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -6031,7 +6031,7 @@ def _offload(self, loss, startup_program=None): self._main_program = loss.block.program self.block = loss.block if startup_program == None: - startup_program = fluid.default_startup_program() + startup_program = paddle.static.default_startup_program() with program_guard(self._main_program, startup_program): assert len(self.checkpoint_shape) > 0, ( diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index e7172507696ec0..007221ca4f9ca3 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -686,6 +686,8 @@ add_subdirectory(asp) add_subdirectory(ir) +add_subdirectory(interpreter) + if (WITH_TESTING) set_property(TEST test_parallel_executor_mnist PROPERTY ENVIRONMENT GLOG_vmodule=all_reduce_deps_pass=10) set_property(TEST test_parallel_executor_fix_op_run_order PROPERTY ENVIRONMENT GLOG_vmodule=fix_op_run_order_pass=10) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ast_util.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ast_util.py index 62b6ac171a4c96..31a50226f0b79e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ast_util.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ast_util.py @@ -16,7 +16,7 @@ import unittest import textwrap -import gast +from paddle.utils import gast import inspect import numpy as np import paddle.fluid as fluid diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py index ea745ad6614253..b86b85bb90ff69 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py @@ -17,7 +17,7 @@ import numpy as np import unittest import inspect -import gast +from paddle.utils import gast import paddle import paddle.fluid as fluid diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py index 8423c056b2d830..95b5235aaa3d0a 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py @@ -184,7 +184,7 @@ def test_optim_break_in_while(x): class TestContinueInFor(unittest.TestCase): def setUp(self): - self.input = 
np.zeros((1)).astype('int32') + self.input = np.zeros((1)).astype('int64') self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda( ) else fluid.CPUPlace() self.init_dygraph_func() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse_basic.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse_basic.py index 7ea6aa8907c282..975797a487be72 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse_basic.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse_basic.py @@ -16,7 +16,7 @@ import unittest import textwrap -import gast +from paddle.utils import gast from paddle.fluid.dygraph.dygraph_to_static.ifelse_transformer import get_name_ids from paddle.fluid.dygraph.dygraph_to_static.static_analysis import StaticAnalysisVisitor from paddle.fluid.dygraph.dygraph_to_static.static_analysis import NodeVarType diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py index 2ed2a273341805..385b7ce204a869 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py @@ -20,7 +20,7 @@ import sys import unittest -import gast +from paddle.utils import gast import paddle from paddle.fluid.dygraph.dygraph_to_static import logging_utils diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logical.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logical.py index c7193eb2a77bc8..b11e9441c8c0e2 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logical.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logical.py @@ -18,7 +18,7 @@ import unittest -import gast +from paddle.utils import gast import numpy as np import paddle diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py index fe86d5d636811e..8116c04f2034fe 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py @@ -14,7 +14,7 @@ from __future__ import print_function -import gast +from paddle.utils import gast import inspect import numpy as np import paddle diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py index 8a21c4cfd0eca8..cac64c7391351b 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py @@ -32,6 +32,9 @@ SEED = 2020 +if paddle.fluid.is_compiled_with_cuda(): + paddle.fluid.set_flags({'FLAGS_cudnn_deterministic': True}) + class SimpleImgConvPool(fluid.dygraph.Layer): def __init__(self, @@ -48,7 +51,7 @@ def __init__(self, conv_dilation=1, conv_groups=1, act=None, - use_cudnn=False, + use_cudnn=True, param_attr=None, bias_attr=None): super(SimpleImgConvPool, self).__init__() @@ -101,7 +104,6 @@ def __init__(self): loc=0.0, scale=scale)), act="softmax") - @paddle.jit.to_static def forward(self, inputs, label=None): x = self.inference(inputs) if label is not None: @@ -167,14 +169,14 @@ def test_mnist_declarative_cpu_vs_mkldnn(self): dygraph_loss_cpu, dygraph_loss_mkldnn)) def train(self, to_static=False): - prog_trans = ProgramTranslator() - prog_trans.enable(to_static) loss_data = [] with 
fluid.dygraph.guard(self.place):
             fluid.default_main_program().random_seed = SEED
             fluid.default_startup_program().random_seed = SEED
             mnist = MNIST()
+            if to_static:
+                mnist = paddle.jit.to_static(mnist)
             adam = AdamOptimizer(
                 learning_rate=0.001, parameter_list=mnist.parameters())
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist_amp.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist_amp.py
new file mode 100644
index 00000000000000..d2160ca641665e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist_amp.py
@@ -0,0 +1,94 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import unittest
+import numpy as np
+from time import time
+from test_mnist import MNIST, TestMNIST, SEED
+from paddle.jit import ProgramTranslator
+from paddle.fluid.optimizer import AdamOptimizer
+
+if paddle.fluid.is_compiled_with_cuda():
+    paddle.fluid.set_flags({'FLAGS_cudnn_deterministic': True})
+
+
+class TestAMP(TestMNIST):
+    def train_static(self):
+        return self.train(to_static=True)
+
+    def train_dygraph(self):
+        return self.train(to_static=False)
+
+    def test_mnist_to_static(self):
+        dygraph_loss = self.train_dygraph()
+        static_loss = self.train_static()
+        # NOTE(Aurelius84): Static-graph AMP training uses a grep_list that
+        # dygraph AMP does not have, so the number of inserted cast ops
+        # differs and the loss shows a small numerical difference.
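The tolerance used just below follows from this note; a minimal sketch of the effect (illustrative only, not part of the patch), since every extra float16 cast rounds the activations slightly:

import numpy as np

x = np.random.rand(1000).astype('float32')
roundtrip = x.astype('float16').astype('float32')  # what an extra cast op does to activations
print(np.abs(x - roundtrip).max())                 # on the order of 1e-4

For values of order 1 the round-off is about 1e-4 per cast, so atol=1e-3 leaves headroom for a few differing casts between the two modes.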
+        self.assertTrue(
+            np.allclose(
+                dygraph_loss, static_loss, atol=1e-3),
+            msg='dygraph is {}\n static_res is \n{}'.format(dygraph_loss,
+                                                            static_loss))
+
+    def train(self, to_static=False):
+        paddle.seed(SEED)
+        mnist = MNIST()
+
+        if to_static:
+            print("Successfully applied @to_static.")
+            mnist = paddle.jit.to_static(mnist)
+
+        adam = AdamOptimizer(
+            learning_rate=0.001, parameter_list=mnist.parameters())
+
+        scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
+
+        loss_data = []
+        for epoch in range(self.epoch_num):
+            start = time()
+            for batch_id, data in enumerate(self.train_reader()):
+                dy_x_data = np.array([x[0].reshape(1, 28, 28)
+                                      for x in data]).astype('float32')
+                y_data = np.array(
+                    [x[1] for x in data]).astype('int64').reshape(-1, 1)
+
+                img = paddle.to_tensor(dy_x_data)
+                label = paddle.to_tensor(y_data)
+                label.stop_gradient = True
+
+                with paddle.amp.auto_cast():
+                    prediction, acc, avg_loss = mnist(img, label=label)
+
+                scaled = scaler.scale(avg_loss)
+                scaled.backward()
+                scaler.minimize(adam, scaled)
+
+                loss_data.append(avg_loss.numpy()[0])
+                # clear gradients for the next batch
+                mnist.clear_gradients()
+                if batch_id % 10 == 0:
+                    print(
+                        "Loss at epoch {} step {}: loss: {:}, acc: {}, cost: {}"
+                        .format(epoch, batch_id,
+                                avg_loss.numpy(), acc.numpy(), time() - start))
+                    start = time()
+                if batch_id == 50:
+                    break
+        return loss_data
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py
index 2ea3e369099109..9e12b6fa208505 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py
@@ -15,7 +15,7 @@
 from __future__ import print_function

 import astor
-import gast
+from paddle.utils import gast
 import inspect
 import numpy as np
 import textwrap
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py
index 0fffb0c985375b..7f6d6cf1f3b005 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py
@@ -14,7 +14,7 @@
 from __future__ import print_function

-import gast
+from paddle.utils import gast
 import inspect
 import numpy as np
 import paddle
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_variable_trans_func.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_variable_trans_func.py
index 9f677d765f9ab2..3431c6aac4cbef 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_variable_trans_func.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_variable_trans_func.py
@@ -14,7 +14,7 @@
 from __future__ import print_function

-import gast
+from paddle.utils import gast
 import unittest
 import numpy as np
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_transformer.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_transformer.py
index b336330836a66c..62b1a8b1da6797 100644
--- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_transformer.py
+++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_transformer.py
@@ -54,13 +54,17 @@ def forward(self, x):
         attention_mask = paddle.tensor.triu(
             (paddle.ones(
                 (length, length), dtype="float32") * -1e9), 1)
-
attention_mask.stop_gradient = True + + no_used = paddle.ones((3, 3), dtype="int32") + w_emb = self.word_embeddings(x) p_emb = self.position_embeddings(x) w_emb = w_emb + p_emb + attention_mask.stop_gradient = True + no_used.stop_gradient = True # need to fix bug of backward() - return w_emb, attention_mask + return w_emb, attention_mask, no_used, p_emb class TransformerNet(Layer): @@ -99,12 +103,12 @@ def forward(self, x): class TransformerNetPipe(TransformerNet): def forward(self, args): - x, mask = args[0], args[1] + x, mask, no_used, p_emb = args[0], args[1], args[2], args[3] output = super().forward(x, mask) - output = output + output = output + p_emb mask.stop_gradient = True - return output, mask + return output, mask, no_used, p_emb class CriterionPipe(Layer): @@ -175,6 +179,8 @@ def test_pp_model(self): loss = model.train_batch([x, x], optimizer, scheduler) # TODO(shenliang03) add utest for loss + print("loss: ", loss) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt b/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt new file mode 100644 index 00000000000000..7692f8befdf58c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt @@ -0,0 +1,6 @@ +file(GLOB TEST_INTERP_CASES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_INTERP_CASES "${TEST_INTERP_CASES}") + +foreach(target ${TEST_INTERP_CASES}) + py_test_modules(${target} MODULES ${target}) +endforeach() diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_interpreter.py b/python/paddle/fluid/tests/unittests/interpreter/test_interpreter.py new file mode 100644 index 00000000000000..bb18d28e48b67d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/interpreter/test_interpreter.py @@ -0,0 +1,55 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +from paddle.fluid import core +from paddle.fluid.core import InterpreterCore + +import numpy as np + +paddle.enable_static() + + +class LinearTestCase(unittest.TestCase): + def setUp(self): + self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else paddle.CPUPlace() + + def test_interp_base(self): + a = paddle.static.data(name="a", shape=[2, 2], dtype='float32') + b = paddle.ones([2, 2]) * 2 + t = paddle.static.nn.fc(a, 2) + c = t + b + + main_program = paddle.fluid.default_main_program() + startup_program = paddle.fluid.default_startup_program() + p = core.Place() + p.set_place(self.place) + inter_core = InterpreterCore(p, main_program.desc, startup_program.desc, + core.Scope()) + + out = inter_core.run({ + "a": np.ones( + [2, 2], dtype="float32") * 2 + }, [c.name]) + for i in range(10): + out = inter_core.run({ + "a": np.ones( + [2, 2], dtype="float32") * i + }, [c.name]) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py index 85054be534eeba..76dc605c3ecd27 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py @@ -80,6 +80,33 @@ def setUp(self): class TRTReshapeTest2(TRTReshapeTest): + def setUp(self): + self.bs = 2 + self.input_shape = [23, 13, 24] + self.reshape = [2, 0, -1, 12] + self.data_shape = [ + self.bs, self.input_shape[0], self.input_shape[1], + self.input_shape[2] + ] + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name='data', shape=self.data_shape, dtype='float32') + actual_reshape = fluid.data( + name='actual_reshape', shape=[4], dtype='int32') + reshape_out = fluid.layers.reshape( + x=data, shape=self.reshape, actual_shape=actual_reshape) + out = fluid.layers.batch_norm(reshape_out, is_test=True) + self.feeds = { + 'data': np.random.random(self.data_shape).astype('float32'), + 'actual_reshape': np.array([2, 0, -1, 12]).astype('int32') + } + self.enable_trt = True + self.trt_parameters = TRTReshapeTest.TensorRTParam( + 1 << 30, self.bs, 1, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [out] + + +class TRTReshapeTest3(TRTReshapeTest): def setUp(self): self.bs = 1 self.input_shape = [14, 48, 27] diff --git a/python/paddle/fluid/tests/unittests/npu/test_atan_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_atan_op_npu.py new file mode 100644 index 00000000000000..a18b8a03075ef8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_atan_op_npu.py @@ -0,0 +1,87 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 1024 + + +class TestAtan(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "atan" + self.place = paddle.NPUPlace(0) + + self.dtype = np.float32 + np.random.seed(SEED) + self.shape = [11, 17] + x = np.random.uniform(0.1, 1, self.shape).astype(self.dtype) + out = np.arctan(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def set_attrs(self): + pass + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + def test_out_name(self): + with fluid.program_guard(fluid.Program()): + np_x = np.array([0.1]) + data = fluid.layers.data(name="X", shape=[1]) + out = paddle.atan(data, name='Y') + place = paddle.NPUPlace(0) + exe = fluid.Executor(place) + result, = exe.run(feed={"X": np_x}, fetch_list=[out]) + expected = np.arctan(np_x) + self.assertEqual(result, expected) + + def test_dygraph(self): + with fluid.dygraph.guard(paddle.NPUPlace(0)): + np_x = np.array([0.1]) + x = fluid.dygraph.to_variable(np_x) + z = paddle.atan(x).numpy() + z_expected = np.arctan(np_x) + self.assertEqual(z, z_expected) + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestAtanShape(TestAtan): + def set_attrs(self): + self.shape = [12, 23, 10] + + +class TestAtanFloat16(TestAtan): + def set_attrs(self): + self.dtype = np.float16 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_compare_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_compare_op_npu.py index d8c22e2da09077..66ce81756fc9d8 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_compare_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_compare_op_npu.py @@ -142,11 +142,12 @@ def test_attr_name(self): globals()[cls_name] = Cls -for _type_name in {'float16', 'float32', 'int32'}: - if _type_name == 'int32': +for _type_name in {'float16', 'float32', 'int32', 'int64', 'bool'}: + if _type_name == 'int32' or _type_name == 'bool': create_test_class('equal', _type_name, lambda _a, _b: _a == _b) continue create_test_class('equal', _type_name, lambda _a, _b: _a == _b) + create_test_class('not_equal', _type_name, lambda _a, _b: _a != _b) create_test_class('less_than', _type_name, lambda _a, _b: _a < _b) create_test_class('less_equal', _type_name, lambda _a, _b: _a <= _b) create_test_class('greater_than', _type_name, lambda _a, _b: _a > _b) diff --git a/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py new file mode 100644 index 00000000000000..9b29fc812faedd --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py @@ -0,0 +1,146 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +class TestCos(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "cos" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + out = np.cos(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-7) + + def test_check_grad(self): + if self.dtype == np.float16: + return + self.check_grad_with_place(self.place, ['X'], 'Out') + + +class TestCosFp16(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "cos" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(1, 2, [3, 4]).astype(self.dtype) + out = np.cos(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestCosNet(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + c = paddle.multiply(a, b) + d = paddle.cos(c) + + fc_1 = fluid.layers.fc(input=d, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_npu(self): + cpu_pred, cpu_loss = self._test(False) + npu_pred, npu_loss = self._test(True) + + self.assertTrue(np.allclose(npu_pred, cpu_pred)) + self.assertTrue(np.allclose(npu_loss, cpu_loss)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py index 
5288db5ceb1c6f..9b27e75e37d255 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py @@ -13,14 +13,16 @@ # limitations under the License. from __future__ import print_function - import numpy as np import unittest import sys sys.path.append("..") -from op_test import OpTest, _set_use_system_allocator -import paddle + +from paddle.fluid import Program, program_guard +import paddle.fluid.core as core import paddle.fluid as fluid +import paddle +from op_test import OpTest, skip_check_grad_ci paddle.enable_static() @@ -63,6 +65,9 @@ def test_check_output(self): self.check_output_with_place(self.place) def test_check_grad_normal(self): + if self.dtype == np.float16: + return + self.check_grad_with_place( self.place, ['X', 'Y'], @@ -70,6 +75,9 @@ def test_check_grad_normal(self): max_relative_error=0.006, ) def test_check_grad_ingore_x(self): + if self.dtype == np.float16: + return + self.check_grad_with_place( self.place, ['Y'], @@ -78,6 +86,9 @@ def test_check_grad_ingore_x(self): max_relative_error=0.006, ) def test_check_grad_ingore_y(self): + if self.dtype == np.float16: + return + self.check_grad_with_place( self.place, ['X'], @@ -86,6 +97,47 @@ def test_check_grad_ingore_y(self): max_relative_error=0.006, ) +class TestFP16ElementwiseAddOp(TestElementwiseAddOp): + def init_dtype(self): + self.dtype = np.float16 + + +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1) to test broadcast.") +class TestElementwiseAddOp_scalar(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(1).astype(self.dtype) + self.out = self.x + self.y + + +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1) to test broadcast.") +class TestFP16ElementwiseAddOp_scalar(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(1).astype(self.dtype) + self.out = self.x + self.y + + +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1,1) to test broadcast.") +class TestElementwiseAddOp_scalar2(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(1, 1).astype(self.dtype) + self.out = self.x + self.y + + +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1,1) to test broadcast.") +class TestFP16ElementwiseAddOp_scalar2(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(1, 1).astype(self.dtype) + self.out = self.x + self.y + + class TestAddAPI(unittest.TestCase): def test_name(self): with paddle.static.program_guard(paddle.static.Program()): @@ -148,5 +200,385 @@ def test_errors(self): self.assertRaises(TypeError, paddle.add, x2, y2) +class TestElementwiseAddOp_Vector(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.random((100, )).astype(self.dtype) + self.y = np.random.random((100, )).astype(self.dtype) + self.out = np.add(self.x, self.y) + + +class TestFP16ElementwiseAddOp_Vector(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.random((100, )).astype(self.dtype) + self.y = np.random.random((100, )).astype(self.dtype) + self.out = np.add(self.x, self.y) + + +class TestElementwiseAddOp_broadcast_0(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(100, 2, 
3).astype(self.dtype) + self.y = np.random.rand(100).astype(self.dtype) + self.out = self.x + self.y.reshape(100, 1, 1) + + def init_axis(self): + self.axis = 0 + + +class TestFP16ElementwiseAddOp_broadcast_0(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(100, 2, 3).astype(self.dtype) + self.y = np.random.rand(100).astype(self.dtype) + self.out = self.x + self.y.reshape(100, 1, 1) + + def init_axis(self): + self.axis = 0 + + +class TestElementwiseAddOp_broadcast_1(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 100, 3).astype(self.dtype) + self.y = np.random.rand(100).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 100, 1) + + def init_axis(self): + self.axis = 1 + + +class TestFP16ElementwiseAddOp_broadcast_1(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 100, 3).astype(self.dtype) + self.y = np.random.rand(100).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 100, 1) + + def init_axis(self): + self.axis = 1 + + +class TestElementwiseAddOp_broadcast_2(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 3, 100).astype(self.dtype) + self.y = np.random.rand(100).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 1, 100) + + +class TestFP16ElementwiseAddOp_broadcast_2(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 3, 100).astype(self.dtype) + self.y = np.random.rand(100).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 1, 100) + + +class TestElementwiseAddOp_broadcast_3(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 10, 12, 1).astype(self.dtype) + self.y = np.random.rand(10, 12).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 10, 12, 1) + + def init_axis(self): + self.axis = 1 + + +class TestFP16ElementwiseAddOp_broadcast_3(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype) + self.y = np.random.rand(10, 12).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 10, 12, 1) + + def init_axis(self): + self.axis = 1 + + +class TestElementwiseAddOp_broadcast_4(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(100, 2, 1, 2).astype(self.dtype) + self.y = np.random.rand(100, 1).astype(self.dtype) + self.out = self.x + self.y.reshape(100, 1, 1, 1) + + def init_axis(self): + self.axis = 0 + + +class TestFP16ElementwiseAddOp_broadcast_4(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(100, 2, 1, 2).astype(self.dtype) + self.y = np.random.rand(100, 1).astype(self.dtype) + self.out = self.x + self.y.reshape(100, 1, 1, 1) + + def init_axis(self): + self.axis = 0 + + +class TestElementwiseAddOp_broadcast_5(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(10, 3, 12).astype(self.dtype) + self.y = np.random.rand(10, 1, 12).astype(self.dtype) + self.out = self.x + self.y + + +class TestFP16ElementwiseAddOp_broadcast_5(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(10, 3, 12).astype(self.dtype) + self.y = np.random.rand(10, 1, 12).astype(self.dtype) + self.out = self.x + self.y + + +class TestElementwiseAddOp_broadcast_6(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 12, 3, 5).astype(self.dtype) + self.y = np.random.rand(2, 12, 1, 5).astype(self.dtype) + self.out = self.x + self.y + + +class 
TestElementwiseAddOp_broadcast_7(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(1, 1, 20, 5).astype(self.dtype) + self.y = np.random.rand(20, 5, 1, 1).astype(self.dtype) + self.out = self.x + self.y + + +class TestFP16ElementwiseAddOp_broadcast_6(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 12, 3, 5).astype(self.dtype) + self.y = np.random.rand(2, 12, 1, 5).astype(self.dtype) + self.out = self.x + self.y + + +class TestElementwiseAddOp_rowwise_add_0(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 10, 12).astype(self.dtype) + self.y = np.random.rand(10, 12).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 10, 12) + + def init_axis(self): + self.axis = 1 + + +class TestFP16ElementwiseAddOp_rowwise_add_0(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 10, 12).astype(self.dtype) + self.y = np.random.rand(10, 12).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 10, 12) + + def init_axis(self): + self.axis = 1 + + +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1) to test broadcast.") +class TestElementwiseAddOp_rowwise_add_1(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(100, 1).astype(self.dtype) + self.y = np.random.rand(1).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 1) + + def init_axis(self): + self.axis = 1 + + +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1) to test broadcast.") +class TestFP16ElementwiseAddOp_rowwise_add_1(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(100, 1).astype(self.dtype) + self.y = np.random.rand(1).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 1) + + def init_axis(self): + self.axis = 1 + + +class TestElementwiseAddOp_channelwise_add(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(100, 2, 3).astype(self.dtype) + self.y = np.random.rand(100, 1, 1).astype(self.dtype) + self.out = self.x + self.y + + def init_axis(self): + self.axis = -1 + + +class TestFP16ElementwiseAddOp_channelwise_add(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(100, 2, 3).astype(self.dtype) + self.y = np.random.rand(100, 1, 1).astype(self.dtype) + self.out = self.x + self.y + + def init_axis(self): + self.axis = -1 + + +class TestElementwiseAddOp_commonuse_add1(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 3, 100).astype(self.dtype) + self.y = np.random.rand(1, 1, 100).astype(self.dtype) + self.out = self.x + self.y + + def init_axis(self): + self.axis = -1 + + +class TestElementwiseFP16AddOp_commonuse_add1(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 3, 100).astype(self.dtype) + self.y = np.random.rand(1, 1, 100).astype(self.dtype) + self.out = self.x + self.y + + def init_axis(self): + self.axis = -1 + + +class TestElementwiseAddOp_commonuse_add2(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(10, 3, 1, 4).astype(self.dtype) + self.y = np.random.rand(10, 1, 12, 1).astype(self.dtype) + self.out = self.x + self.y + + def init_axis(self): + self.axis = -1 + + +class TestElementwiseAddOp_xsize_lessthan_ysize_add(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(10, 12).astype(self.dtype) + self.y = np.random.rand(2, 2, 10, 12).astype(self.dtype) + self.out = self.x + self.y + + def 
init_axis(self): + self.axis = 2 + + +class TestElementwiseAddOp_same_shape_ysize_large(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(10, 1, 12).astype(self.dtype) + self.y = np.random.rand(10, 2, 12).astype(self.dtype) + self.out = self.x + self.y + + def init_axis(self): + self.axis = 0 + + +class TestElementwiseAddOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + # the input of elementwise_add must be Variable. + x1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0)) + y1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0)) + self.assertRaises(TypeError, fluid.layers.elementwise_add, x1, y1) + + # the input dtype of elementwise_add must be float16 or float32 or float64 or int32 or int64 + # float16 only can be set on GPU place + x2 = fluid.layers.data(name='x2', shape=[3, 4, 5, 6], dtype="uint8") + y2 = fluid.layers.data(name='y2', shape=[3, 4, 5, 6], dtype="uint8") + self.assertRaises(TypeError, fluid.layers.elementwise_add, x2, y2) + + +class TestAddApi(unittest.TestCase): + def _executed_api(self, x, y, name=None): + return paddle.add(x, y, name) + + def test_name(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data(name="x", shape=[2, 3], dtype="float32") + y = fluid.data(name='y', shape=[2, 3], dtype='float32') + + y_1 = self._executed_api(x, y, name='add_res') + self.assertEqual(('add_res' in y_1.name), True) + + def test_declarative(self): + with fluid.program_guard(fluid.Program()): + + def gen_data(): + return { + "x": np.array([2, 3, 4]).astype('float32'), + "y": np.array([1, 5, 2]).astype('float32') + } + + x = fluid.data(name="x", shape=[3], dtype='float32') + y = fluid.data(name="y", shape=[3], dtype='float32') + z = self._executed_api(x, y) + + place = fluid.NPUPlace(0) + exe = fluid.Executor(place) + z_value = exe.run(feed=gen_data(), fetch_list=[z.name]) + z_expected = np.array([3., 8., 6.]) + self.assertEqual((z_value == z_expected).all(), True) + + def test_dygraph(self): + with fluid.dygraph.guard(paddle.NPUPlace(0)): + np_x = np.array([2, 3, 4]).astype('float64') + np_y = np.array([1, 5, 2]).astype('float64') + x = fluid.dygraph.to_variable(np_x) + y = fluid.dygraph.to_variable(np_y) + z = self._executed_api(x, y) + np_z = z.numpy() + z_expected = np.array([3., 8., 6.]) + self.assertEqual((np_z == z_expected).all(), True) + + +class TestAddInplaceApi(TestAddApi): + def _executed_api(self, x, y, name=None): + return x.add_(y, name) + + +class TestAddInplaceBroadcastSuccess(unittest.TestCase): + def init_data(self): + self.x_numpy = np.random.rand(2, 3, 4).astype('float') + self.y_numpy = np.random.rand(3, 4).astype('float') + + def test_broadcast_success(self): + paddle.disable_static(place=paddle.NPUPlace(0)) + self.init_data() + x = paddle.to_tensor(self.x_numpy) + y = paddle.to_tensor(self.y_numpy) + inplace_result = x.add_(y) + numpy_result = self.x_numpy + self.y_numpy + self.assertEqual((inplace_result.numpy() == numpy_result).all(), True) + paddle.enable_static() + + +class TestAddInplaceBroadcastSuccess2(TestAddInplaceBroadcastSuccess): + def init_data(self): + self.x_numpy = np.random.rand(1, 2, 3, 1).astype('float') + self.y_numpy = np.random.rand(3, 1).astype('float') + + +class TestAddInplaceBroadcastSuccess3(TestAddInplaceBroadcastSuccess): + def init_data(self): + self.x_numpy = np.random.rand(2, 3, 1, 5).astype('float') + self.y_numpy = np.random.rand(1, 3, 1, 5).astype('float') + + 
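The Success cases above and the Error cases below all follow one rule for in-place addition: y must broadcast up to x's shape without changing x. A rough sketch of that check (illustrative only, not part of the patch; the real validation happens inside the op):

def y_broadcasts_into_x(x_shape, y_shape):
    # y may have fewer dims than x; over the trailing dims, each of y's
    # dims must be 1 or equal to x's dim so x's shape is preserved in-place.
    if len(y_shape) > len(x_shape):
        return False
    for xd, yd in zip(reversed(x_shape), reversed(y_shape)):
        if yd != 1 and yd != xd:
            return False
    return True

print(y_broadcasts_into_x((2, 3, 4), (3, 4)))  # True,  a "Success" case above
print(y_broadcasts_into_x((3, 4), (2, 3, 4)))  # False, an "Error" case below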
+class TestAddInplaceBroadcastError(unittest.TestCase): + def init_data(self): + self.x_numpy = np.random.rand(3, 4).astype('float') + self.y_numpy = np.random.rand(2, 3, 4).astype('float') + + def test_broadcast_errors(self): + paddle.disable_static(place=paddle.NPUPlace(0)) + self.init_data() + x = paddle.to_tensor(self.x_numpy) + y = paddle.to_tensor(self.y_numpy) + + def broadcast_shape_error(): + x.add_(y) + + self.assertRaises(ValueError, broadcast_shape_error) + paddle.enable_static() + + +class TestAddInplaceBroadcastError2(TestAddInplaceBroadcastError): + def init_data(self): + self.x_numpy = np.random.rand(2, 1, 4).astype('float') + self.y_numpy = np.random.rand(2, 3, 4).astype('float') + + +class TestAddInplaceBroadcastError3(TestAddInplaceBroadcastError): + def init_data(self): + self.x_numpy = np.random.rand(5, 2, 1, 4).astype('float') + self.y_numpy = np.random.rand(2, 3, 4).astype('float') + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_expand_as_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_expand_as_v2_op_npu.py new file mode 100644 index 00000000000000..99edc25f7696a4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_expand_as_v2_op_npu.py @@ -0,0 +1,146 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +np.random.seed(10) + + +class TestExpandAsOpRank1(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "expand_as_v2" + x = np.random.rand(100).astype("float32") + target_tensor = np.random.rand(2, 100).astype("float32") + self.inputs = {'X': x} + self.attrs = {'target_shape': target_tensor.shape} + bcast_dims = [2, 1] + output = np.tile(self.inputs['X'], bcast_dims) + self.outputs = {'Out': output} + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + pass + + +class TestExpandAsOpRank2(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "expand_as_v2" + x = np.random.rand(10, 12).astype("float32") + target_tensor = np.random.rand(10, 12).astype("float32") + self.inputs = {'X': x} + self.attrs = {'target_shape': target_tensor.shape} + bcast_dims = [1, 1] + output = np.tile(self.inputs['X'], bcast_dims) + self.outputs = {'Out': output} + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + pass + + +class TestExpandAsOpRank3(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "expand_as_v2" + x = np.random.rand(2, 3, 20).astype("float32") + target_tensor = np.random.rand(2, 3, 20).astype("float32") + self.inputs = {'X': x} + self.attrs = {'target_shape': target_tensor.shape} + bcast_dims = [1, 1, 1] + output = np.tile(self.inputs['X'], bcast_dims) + self.outputs = {'Out': output} + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + pass + + +class TestExpandAsOpRank4(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "expand_as_v2" + x = np.random.rand(1, 1, 7, 16).astype("float32") + target_tensor = np.random.rand(4, 6, 7, 16).astype("float32") + self.inputs = {'X': x} + self.attrs = {'target_shape': target_tensor.shape} + bcast_dims = [4, 6, 1, 1] + output = np.tile(self.inputs['X'], bcast_dims) + self.outputs = {'Out': output} + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + pass + + +# Test python API +class TestExpandAsV2API(unittest.TestCase): + def test_api(self): + input1 = np.random.random([12, 14]).astype("float32") + input2 = np.random.random([2, 12, 14]).astype("float32") + x = fluid.layers.data( + name='x', shape=[12, 14], append_batch_size=False, dtype="float32") + + y = fluid.layers.data( + name='target_tensor', + shape=[2, 12, 14], + append_batch_size=False, + dtype="float32") + + out_1 = paddle.expand_as(x, y=y) + + exe = fluid.Executor(place=fluid.NPUPlace(0)) + res_1 = exe.run(fluid.default_main_program(), + feed={"x": input1, + "target_tensor": input2}, + fetch_list=[out_1]) + assert np.array_equal(res_1[0], np.tile(input1, (2, 1, 1))) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_eye_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_eye_op_npu.py new file mode 100755 index 
00000000000000..abe981399a9626 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_eye_op_npu.py @@ -0,0 +1,195 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +import paddle.fluid.framework as framework + +paddle.enable_static() +np.random.seed(10) + + +class TestEyeOp(OpTest): + def setUp(self): + ''' + Test eye op with specified shape + ''' + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "eye" + self.inputs = {} + + self.num_rows = 0 + self.num_columns = 0 + self.dtype = np.float32 + + self.initTestCase() + + if self.num_columns == 0: + self.attrs = { + 'num_rows': self.num_rows, + 'dtype': framework.convert_np_dtype_to_dtype_(self.dtype) + } + self.outputs = {'Out': np.eye(self.num_rows, dtype=self.dtype)} + else: + self.attrs = { + 'num_rows': self.num_rows, + 'num_columns': self.num_columns, + 'dtype': framework.convert_np_dtype_to_dtype_(self.dtype) + } + self.outputs = { + 'Out': np.eye(self.num_rows, self.num_columns, dtype=self.dtype) + } + + def initTestCase(self): + self.num_rows = 219 + self.num_columns = 319 + self.dtype = np.int32 + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestEyeOp1(TestEyeOp): + def initTestCase(self): + self.num_rows = 50 + + +class TestEyeOp2(TestEyeOp): + def initTestCase(self): + self.num_rows = 50 + self.dtype = np.int32 + + +class TestEyeOp3(TestEyeOp): + def initTestCase(self): + self.num_rows = 50 + self.dtype = np.float16 + + +class TestEyeOp4(TestEyeOp): + def initTestCase(self): + self.num_rows = 1 + self.num_columns = 99 + + +class TestEyeOp5(TestEyeOp): + def initTestCase(self): + self.num_rows = 100 + self.num_columns = 100 + + +class TestEyeOp6(TestEyeOp): + def initTestCase(self): + self.num_rows = 100 + self.num_columns = 100 + self.dtype = np.float32 + + +class API_TestTensorEye(unittest.TestCase): + def test_out(self): + with paddle.static.program_guard(paddle.static.Program()): + data = paddle.eye(10) + place = paddle.NPUPlace(0) + exe = paddle.static.Executor(place) + result, = exe.run(fetch_list=[data]) + expected_result = np.eye(10, dtype="float32") + self.assertEqual((result == expected_result).all(), True) + + with paddle.static.program_guard(paddle.static.Program()): + data = paddle.eye(10, num_columns=7, dtype="float16") + place = paddle.NPUPlace(0) + exe = paddle.static.Executor(place) + result, = exe.run(fetch_list=[data]) + expected_result = np.eye(10, 7, dtype="float16") + self.assertEqual((result == expected_result).all(), True) + + with paddle.static.program_guard(paddle.static.Program()): + data = paddle.eye(10, dtype="int32") + place = paddle.NPUPlace(0) + exe = paddle.static.Executor(place) + result, = 
exe.run(fetch_list=[data]) + expected_result = np.eye(10, dtype="int32") + self.assertEqual((result == expected_result).all(), True) + + paddle.disable_static(paddle.NPUPlace(0)) + out = paddle.eye(10, dtype="int32") + expected_result = np.eye(10, dtype="int32") + paddle.enable_static() + self.assertEqual((out.numpy() == expected_result).all(), True) + + paddle.disable_static(paddle.NPUPlace(0)) + batch_shape = [2] + out = fluid.layers.eye(10, 10, dtype="int32", batch_shape=batch_shape) + result = np.eye(10, dtype="int32") + expected_result = [] + for index in reversed(batch_shape): + tmp_result = [] + for i in range(index): + tmp_result.append(result) + result = tmp_result + expected_result = np.stack(result, axis=0) + paddle.enable_static() + self.assertEqual(out.numpy().shape == np.array(expected_result).shape, + True) + self.assertEqual((out.numpy() == expected_result).all(), True) + + paddle.disable_static(paddle.NPUPlace(0)) + batch_shape = [3, 2] + out = fluid.layers.eye(10, 10, dtype="int32", batch_shape=batch_shape) + result = np.eye(10, dtype="int32") + expected_result = [] + for index in reversed(batch_shape): + tmp_result = [] + for i in range(index): + tmp_result.append(result) + result = tmp_result + expected_result = np.stack(result, axis=0) + paddle.enable_static() + self.assertEqual(out.numpy().shape == np.array(expected_result).shape, + True) + self.assertEqual((out.numpy() == expected_result).all(), True) + + def test_errors(self): + with paddle.static.program_guard(paddle.static.Program()): + + def test_num_rows_type_check(): + paddle.eye(-1, dtype="int64") + + self.assertRaises(TypeError, test_num_rows_type_check) + + def test_num_columns_type_check(): + paddle.eye(10, num_columns=5.2, dtype="int64") + + self.assertRaises(TypeError, test_num_columns_type_check) + + def test_num_columns_type_check1(): + paddle.eye(10, num_columns=10, dtype="int8") + + self.assertRaises(TypeError, test_num_columns_type_check1) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_fill_any_like_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_fill_any_like_op_npu.py new file mode 100644 index 00000000000000..a687509e6ae9c6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_fill_any_like_op_npu.py @@ -0,0 +1,88 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
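+# fill_any_like produces an output with the same shape and dtype as X, filled with the
+# scalar attribute `value`; the expected results below are built with np.full(shape, value, dtype).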
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import core + +paddle.enable_static() + + +class TestFillAnyLikeNPUOp(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "fill_any_like" + self.dtype = np.float32 + self.shape = [2, 3, 4, 5] + self.value = 0.0 + + self.init() + + self.inputs = {'X': np.random.random(self.shape).astype(self.dtype)} + self.attrs = {'value': self.value} + self.outputs = {'Out': np.full(self.shape, self.value, self.dtype)} + + def init(self): + pass + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestFillAnyLikeNPUOpInt32(TestFillAnyLikeNPUOp): + def init(self): + self.dtype = np.int32 + self.value = -1 + + +class TestFillAnyLikeNPUOpFloat32(TestFillAnyLikeNPUOp): + def init(self): + self.dtype = np.float32 + self.value = 0.09 + + +class TestFillAnyLikeNPUOpFloat16(TestFillAnyLikeNPUOp): + def init(self): + self.dtype = np.float16 + self.value = 0.05 + + +class TestFillAnyLikeNPUOpValue1(TestFillAnyLikeNPUOp): + def init(self): + self.value = 1.0 + + +class TestFillAnyLikeNPUOpValue2(TestFillAnyLikeNPUOp): + def init(self): + self.value = 1e-9 + + +class TestFillAnyLikeNPUOpShape(TestFillAnyLikeNPUOp): + def init(self): + self.shape = [12, 10] + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_fill_constant_batch_size_like_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_fill_constant_batch_size_like_op_npu.py new file mode 100644 index 00000000000000..7736c85c87aa29 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_fill_constant_batch_size_like_op_npu.py @@ -0,0 +1,134 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
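+# fill_constant_batch_size_like takes the `shape` attribute, replaces dim `output_dim_idx`
+# with dim `input_dim_idx` of Input, and fills the result with `value` (or `str_value` when
+# it is non-empty); the cases below cover different shapes, dtypes and dim indices.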
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import core + +paddle.enable_static() +SEED = 2021 + + +class TestFillConstantBatchSizeLike(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "fill_constant_batch_size_like" + self.init_shape() + self.init_value() + self.init_dtype() + self.init_force_cpu() + self.init_dim_idx() + + self.inputs = { + 'Input': np.random.random(self.input_shape).astype("float32") + } + self.attrs = { + 'shape': self.shape, + 'value': self.value, + 'str_value': self.str_value, + 'dtype': self.dtype, + 'force_cpu': self.force_cpu, + 'input_dim_idx': self.input_dim_idx, + 'output_dim_idx': self.output_dim_idx + } + self.outputs = { + 'Out': np.full(self.output_shape, self.output_value, + self.output_dtype) + } + + def set_npu(self): + self.__class__.use_npu = True + + def init_shape(self): + self.input_shape = [4, 5] + self.shape = [123, 92] + self.output_shape = (4, 92) + + def init_value(self): + self.value = 3.8 + self.str_value = '' + self.output_value = 3.8 + + def init_dtype(self): + self.dtype = core.VarDesc.VarType.FP32 + self.output_dtype = np.float32 + + def init_force_cpu(self): + self.force_cpu = False + + def init_dim_idx(self): + self.input_dim_idx = 0 + self.output_dim_idx = 0 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestFillConstantBatchSizeLike2(TestFillConstantBatchSizeLike): + def init_shape(self): + # test shape + self.input_shape = [4, 5, 6, 7] + self.shape = [10, 123, 92] + self.output_shape = (4, 123, 92) + + +class TestFillConstantBatchSizeLike3(TestFillConstantBatchSizeLike): + def init_value(self): + # use 'str_value' rather than 'value' + self.value = 3.8 + self.str_value = '4.5' + self.output_value = 4.5 + + +class TestFillConstantBatchSizeLike6(TestFillConstantBatchSizeLike): + def init_dtype(self): + self.dtype = core.VarDesc.VarType.FP16 + self.output_dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-2) + + +class TestFillConstantBatchSizeLike7(TestFillConstantBatchSizeLike): + def init_dtype(self): + self.dtype = core.VarDesc.VarType.INT32 + self.output_dtype = np.int32 + + +class TestFillConstantBatchSizeLike8(TestFillConstantBatchSizeLike): + def init_force_cpu(self): + self.force_cpu = True + + +class TestFillConstantBatchSizeLike9(TestFillConstantBatchSizeLike): + def init_shape(self): + self.input_shape = [4, 5] + self.shape = [123, 92] + self.output_shape = (123, 4) + + def init_dim_idx(self): + self.input_dim_idx = 0 + self.output_dim_idx = 1 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_flatten2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_flatten2_op_npu.py new file mode 100755 index 00000000000000..acd7ca770164e5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_flatten2_op_npu.py @@ -0,0 +1,82 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import sys +sys.path.append("..") +import numpy as np +import paddle +import paddle.fluid as fluid +from op_test import OpTest +paddle.enable_static() + + +class TestFlatten2Op(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "flatten2" + self.place = paddle.NPUPlace(0) + self.init_test_case() + self.inputs = {"X": np.random.random(self.in_shape).astype("float64")} + self.init_attrs() + self.outputs = { + "Out": self.inputs["X"].reshape(self.new_shape), + "XShape": np.random.random(self.in_shape).astype("float32") + } + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place, no_check_set=["XShape"]) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X"], "Out") + + def init_test_case(self): + self.in_shape = (3, 2, 4, 5) + self.axis = 1 + self.new_shape = (3, 40) + + def init_attrs(self): + self.attrs = {"axis": self.axis} + + +class TestFlatten2OpWithCornerAxis(TestFlatten2Op): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.axis = 0 + self.new_shape = (1, 120) + + +class TestFlatten2OpWithDefaultAxis(TestFlatten2Op): + def init_test_case(self): + self.in_shape = (10, 2, 2, 3) + self.new_shape = (10, 12) + + def init_attrs(self): + self.attrs = {} + + +class TestFlatten2OpSixDims(TestFlatten2Op): + def init_test_case(self): + self.in_shape = (3, 2, 3, 2, 4, 4) + self.axis = 4 + self.new_shape = (36, 16) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_flatten_contiguous_range_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_flatten_contiguous_range_op_npu.py new file mode 100644 index 00000000000000..88e711dcf068e6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_flatten_contiguous_range_op_npu.py @@ -0,0 +1,318 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
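+# flatten_contiguous_range collapses the dimensions in [start_axis, stop_axis] into one,
+# e.g. a (3, 2, 5, 4) input with start_axis=1 and stop_axis=2 becomes (3, 10, 4); the
+# expected outputs below are computed with a plain numpy reshape.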
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() + + +class TestFlattenOp(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "flatten_contiguous_range" + self.place = paddle.NPUPlace(0) + + self.start_axis = 0 + self.stop_axis = -1 + self.dtype = np.float64 + self.init_test_case() + self.inputs = {"X": np.random.random(self.in_shape).astype(self.dtype)} + self.init_attrs() + self.outputs = { + "Out": self.inputs["X"].reshape(self.new_shape), + "XShape": np.random.random(self.in_shape).astype("float32") + } + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place, no_check_set=["XShape"]) + + def test_check_grad(self): + pass + + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = -1 + self.new_shape = (120) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_1(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 1 + self.stop_axis = 2 + self.new_shape = (3, 10, 4) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_2(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_3(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 2 + self.new_shape = (30, 4) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_4(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = -2 + self.stop_axis = -1 + self.new_shape = (3, 2, 20) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_5(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 2 + self.stop_axis = 2 + self.new_shape = (3, 2, 5, 4) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOpSixDims(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 3, 2, 4, 4) + self.start_axis = 3 + self.stop_axis = 5 + self.new_shape = (3, 2, 3, 32) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_Float32(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + self.dtype = np.float32 + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_int(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + self.dtype = np.int + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_uint8(TestFlattenOp): + def init_test_case(self): + 
self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + self.dtype = np.uint8 + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_int8(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + self.dtype = np.int8 + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_int64(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + self.dtype = np.int64 + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlatten2OpError(unittest.TestCase): + def test_errors(self): + image_shape = (2, 3, 4, 4) + x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] * + image_shape[3]).reshape(image_shape) / 100. + x = x.astype('float32') + + def test_ValueError1(): + x_var = paddle.static.data( + name="x", shape=image_shape, dtype='float32') + out = paddle.flatten(x_var, start_axis=2, stop_axis=1) + + self.assertRaises(ValueError, test_ValueError1) + + def test_ValueError2(): + x_var = paddle.static.data( + name="x", shape=image_shape, dtype='float32') + paddle.flatten(x_var, start_axis=10, stop_axis=1) + + self.assertRaises(ValueError, test_ValueError2) + + def test_ValueError3(): + x_var = paddle.static.data( + name="x", shape=image_shape, dtype='float32') + paddle.flatten(x_var, start_axis=2, stop_axis=10) + + self.assertRaises(ValueError, test_ValueError3) + + def test_type(): + # dtype must be float32, float64, int8, int32, int64, uint8. + x2 = np.arange(image_shape[0] * image_shape[1] * image_shape[2] * + image_shape[3]).reshape(image_shape) / 100. + x2 = x2.astype('float16') + x2_var = paddle.fluid.data( + name='x2', shape=[3, 2, 4, 5], dtype='float16') + paddle.flatten(x2_var) + + self.assertRaises(TypeError, test_type) + + def test_InputError(): + out = paddle.flatten(x) + + self.assertRaises(ValueError, test_InputError) + + +class TestStaticFlattenPythonAPI(unittest.TestCase): + def execute_api(self, x, start_axis=0, stop_axis=-1): + return paddle.flatten(x, start_axis, stop_axis) + + def test_static_api(self): + paddle.enable_static() + np_x = np.random.rand(2, 3, 4, 4).astype('float32') + + main_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, paddle.static.Program()): + x = paddle.static.data( + name="x", shape=[2, 3, 4, 4], dtype='float32') + out = self.execute_api(x, start_axis=-2, stop_axis=-1) + + exe = paddle.static.Executor(place=paddle.NPUPlace(0)) + fetch_out = exe.run(main_prog, feed={"x": np_x}, fetch_list=[out]) + self.assertTrue((2, 3, 16) == fetch_out[0].shape) + + +class TestStaticInplaceFlattenPythonAPI(TestStaticFlattenPythonAPI): + def execute_api(self, x, start_axis=0, stop_axis=-1): + return x.flatten_(start_axis, stop_axis) + + +class TestFlattenPython(unittest.TestCase): + def test_python_api(self): + image_shape = (2, 3, 4, 4) + x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] * + image_shape[3]).reshape(image_shape) / 100. 
+ x = x.astype('float32') + + def test_InputError(): + out = paddle.flatten(x) + + self.assertRaises(ValueError, test_InputError) + + def test_Negative(): + paddle.disable_static(paddle.NPUPlace(0)) + img = paddle.to_tensor(x) + out = paddle.flatten(img, start_axis=-2, stop_axis=-1) + return out.numpy().shape + + res_shape = test_Negative() + self.assertTrue((2, 3, 16) == res_shape) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_index_select_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_index_select_op_npu.py new file mode 100644 index 00000000000000..ff0d57d1d4da10 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_index_select_op_npu.py @@ -0,0 +1,153 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +from paddle.static import Program, program_guard + +paddle.enable_static() +SEED = 2021 + + +class TestNPUIndexSelect(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "index_select" + self.config() + + x_np = np.random.random(self.x_shape).astype(self.x_type) + index_np = np.random.randint( + low=0, high=self.x_shape[self.dim], size=self.index_size) + + # compute real output as baseline. + outer_loop = np.prod(self.x_shape[:self.dim]) + outer_loop = outer_loop.astype(self.index_type) + x_reshape = [outer_loop] + list(self.x_shape[self.dim:]) + x_np_reshape = np.reshape(x_np, tuple(x_reshape)) + + out_list = [] + for i in range(outer_loop): + for j in range(self.index_size): + out_list.append(x_np_reshape[i, index_np[j]]) + self.out_shape = list(self.x_shape) + self.out_shape[self.dim] = self.index_size + self.out_shape = tuple(self.out_shape) + out = np.reshape(out_list, self.out_shape) + + self.inputs = {'X': x_np, 'Index': index_np} + self.attrs = {'dim': self.dim} + self.outputs = {'Out': out} + + # todo: comment second line when index_select grad npu op is ready. + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + # todo: replace first line with second line when index_select grad npu op is ready. 
+ def test_check_grad(self): + pass + #self.check_grad_with_place(self.place, ['X'], 'Out') + + def config(self): + self.x_shape = (100, 4, 5) + self.x_type = np.float32 + self.dim = 1 + self.index_size = 100 + self.index_type = np.int64 + + +class TestNPUIndexSelectCase2(TestNPUIndexSelect): + def config(self): + self.dim = -2 + self.x_type = np.float32 + self.index_type = np.int32 + self.x_shape = (10, 10, 4, 10) + self.index_size = 10 + + +class TestNPUIndexSelectAPI(unittest.TestCase): + def input_data(self): + self.data_x = np.array([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0], + [9.0, 10.0, 11.0, 12.0]]).astype('float32') + self.data_index = np.array([0, 1, 1]).astype('int32') + + def test_index_select_api(self): + paddle.set_device("npu:0") + paddle.enable_static() + self.input_data() + + # case 1: + with program_guard(Program(), Program()): + x = paddle.static.data(name='x', shape=[-1, 4], dtype='float32') + index = paddle.static.data(name='index', shape=[3], dtype='int32') + z = paddle.index_select(x, index, axis=1) + exe = paddle.static.Executor(paddle.NPUPlace(0)) + res, = exe.run(feed={'x': self.data_x, + 'index': self.data_index}, + fetch_list=[z.name], + return_numpy=False) + expect_out = np.array([[1.0, 2.0, 2.0], [5.0, 6.0, 6.0], + [9.0, 10.0, 10.0]]).astype('float32') + self.assertTrue(np.allclose(expect_out, np.array(res))) + + # case 2: + with program_guard(Program(), Program()): + x = paddle.static.data(name='x', shape=[-1, 4], dtype='float32') + index = paddle.static.data(name='index', shape=[3], dtype='int32') + z = paddle.index_select(x, index) + exe = paddle.static.Executor(paddle.NPUPlace(0)) + res, = exe.run(feed={'x': self.data_x, + 'index': self.data_index}, + fetch_list=[z.name], + return_numpy=False) + expect_out = np.array([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0], + [5.0, 6.0, 7.0, 8.0]]).astype('float32') + self.assertTrue(np.allclose(expect_out, np.array(res))) + + def test_dygraph_index_select_api(self): + paddle.set_device("npu:0") + paddle.disable_static() + self.input_data() + + # case 1: + x = paddle.to_tensor(self.data_x) + index = paddle.to_tensor(self.data_index) + z = paddle.index_select(x, index) + np_z = z.numpy() + expect_out = np.array([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0], + [5.0, 6.0, 7.0, 8.0]]).astype('float32') + self.assertTrue(np.allclose(expect_out, np_z)) + + # case 2: + x = paddle.to_tensor(self.data_x) + index = paddle.to_tensor(self.data_index) + z = paddle.index_select(x, index, axis=1) + np_z = z.numpy() + expect_out = np.array([[1.0, 2.0, 2.0], [5.0, 6.0, 6.0], + [9.0, 10.0, 10.0]]).astype('float32') + self.assertTrue(np.allclose(expect_out, np_z)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_one_hot_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_one_hot_op_npu.py new file mode 100644 index 00000000000000..c92fffb2d26cbf --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_one_hot_op_npu.py @@ -0,0 +1,193 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import sys +import unittest +import numpy as np +sys.path.append("..") + +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.framework import Program, program_guard + +paddle.enable_static() + + +class TestOneHotOp(OpTest): + def set_npu(self): + self.__class__.use_npu = True + + def setUp(self): + self.set_npu() + self.op_type = 'one_hot' + depth = 10 + depth_np = np.array(10).astype('int32') + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1]) + + out = np.zeros(shape=(np.product(x.shape[:-1]), + depth)).astype('float32') + + for i in range(np.product(x.shape)): + out[i, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np} + self.attrs = {'dtype': int(core.VarDesc.VarType.FP32)} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + self.check_output_with_place(paddle.NPUPlace(0), check_dygraph=False) + + +class TestOneHotOp_attr(OpTest): + def set_npu(self): + self.__class__.use_npu = True + + def setUp(self): + self.set_npu() + self.op_type = 'one_hot' + depth = 10 + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1]) + + out = np.zeros(shape=(np.product(x.shape[:-1]), + depth)).astype('float32') + + for i in range(np.product(x.shape)): + out[i, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod)} + self.attrs = {'dtype': int(core.VarDesc.VarType.FP32), 'depth': depth} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + self.check_output_with_place(paddle.NPUPlace(0), check_dygraph=False) + + +class TestOneHotOp_default_dtype(OpTest): + def set_npu(self): + self.__class__.use_npu = True + + def setUp(self): + self.set_npu() + self.op_type = 'one_hot' + depth = 10 + depth_np = np.array(10).astype('int32') + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1]) + + out = np.zeros(shape=(np.product(x.shape[:-1]), + depth)).astype('float32') + + for i in range(np.product(x.shape)): + out[i, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np} + self.attrs = {} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + self.check_output_with_place(paddle.NPUPlace(0), check_dygraph=False) + + +class TestOneHotOp_default_dtype_attr(OpTest): + def set_npu(self): + self.__class__.use_npu = True + + def setUp(self): + self.set_npu() + self.op_type = 'one_hot' + depth = 10 + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1]) + + out = np.zeros(shape=(np.product(x.shape[:-1]), + depth)).astype('float32') + + for i in range(np.product(x.shape)): + out[i, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod)} + self.attrs = {'depth': depth} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + self.check_output_with_place(paddle.NPUPlace(0), check_dygraph=False) + + +class TestOneHotOp_out_of_range(OpTest): + def set_npu(self): + self.__class__.use_npu = True + + def setUp(self): + self.set_npu() + 
self.op_type = 'one_hot' + depth = 10 + x_lod = [[4, 1, 3, 3]] + x = [np.random.choice([-1, depth]) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1]) + + out = np.zeros(shape=(np.product(x.shape[:-1]), + depth)).astype('float32') + + self.inputs = {'X': (x, x_lod)} + self.attrs = {'depth': depth, 'allow_out_of_range': True} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + self.check_output_with_place(paddle.NPUPlace(0), check_dygraph=False) + + +class TestOneHotOp_dtype_int64(OpTest): + def set_npu(self): + self.__class__.use_npu = True + + def setUp(self): + self.set_npu() + self.op_type = 'one_hot' + depth = 10 + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int64').reshape([sum(x_lod[0]), 1]) + + out = np.zeros(shape=(np.product(x.shape[:-1]), + depth)).astype('float32') + + for i in range(np.product(x.shape)): + out[i, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod)} + self.attrs = {'depth': depth} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + self.check_output_with_place(paddle.NPUPlace(0), check_dygraph=False) + + +if __name__ == '__main__': + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_reduce_max_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reduce_max_op_npu.py index a5d078ced28767..f6c346159b8bee 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_reduce_max_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_reduce_max_op_npu.py @@ -127,8 +127,6 @@ def setUp(self): 'out_dtype': int(core.VarDesc.VarType.INT16) } - self.out = self.inputs['X'].max(axis=tuple(self.attrs['dim'])) - self.outputs = { 'Out': self.inputs['X'].max(axis=tuple(self.attrs['dim'])).astype(np.int16) @@ -195,9 +193,6 @@ def setUp(self): 'dim': [-2, -1], 'out_dtype': int(core.VarDesc.VarType.FP16) } - - self.out = self.inputs['X'].max(axis=tuple(self.attrs['dim'])) - self.outputs = { 'Out': self.inputs['X'].max( axis=tuple(self.attrs['dim'])).astype(np.float16) diff --git a/python/paddle/fluid/tests/unittests/npu/test_reduce_prod_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reduce_prod_op_npu.py new file mode 100644 index 00000000000000..59f181be5edacb --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_reduce_prod_op_npu.py @@ -0,0 +1,235 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
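+# The expected outputs below are numpy references: X.prod(axis=tuple(dim)), cast to the
+# requested `out_dtype` when one is given; `reduce_all` multiplies over every element.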
+ +from __future__ import print_function + +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +from paddle.fluid.framework import convert_np_dtype_to_dtype_ + +paddle.enable_static() + + +class TestNPUReduceProd(OpTest): + def setUp(self): + self.op_type = "reduce_prod" + self.set_npu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'dim': [0]} + self.outputs = { + 'Out': self.inputs['X'].prod(axis=tuple(self.attrs['dim'])) + } + + def test_check_output(self): + self.check_output_with_place(self.place) + + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + +class TestNPUReduceProd2(TestNPUReduceProd): + def setUp(self): + self.op_type = "reduce_prod" + self.set_npu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {} # default 'dim': [0] + self.outputs = {'Out': self.inputs['X'].prod(axis=tuple([0]))} + + +class TestNPUReduceProd3(TestNPUReduceProd): + def setUp(self): + self.op_type = "reduce_prod" + self.set_npu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + # self.attrs = {'dim': [0]} + self.outputs = {'Out': self.inputs['X'].prod(axis=tuple([0]))} + + +class TestNPUReduceProd6D(TestNPUReduceProd): + def setUp(self): + self.op_type = "reduce_prod" + self.set_npu() + self.init_dtype() + + self.inputs = { + 'X': np.random.random((5, 6, 2, 3, 4, 2)).astype(self.dtype) + } + self.attrs = {'dim': [2, 3, 4]} + self.outputs = { + 'Out': self.inputs['X'].prod(axis=tuple(self.attrs['dim'])) + } + + +class TestNPUReduceProd8D(TestNPUReduceProd): + def setUp(self): + self.op_type = "reduce_prod" + self.set_npu() + self.init_dtype() + + self.inputs = { + 'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype(self.dtype) + } + self.attrs = {'dim': [2, 3, 4]} + self.outputs = { + 'Out': self.inputs['X'].prod(axis=tuple(self.attrs['dim'])) + } + + +class TestReduceAll(TestNPUReduceProd): + def setUp(self): + self.op_type = "reduce_prod" + self.set_npu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'reduce_all': True} + self.outputs = {'Out': self.inputs['X'].prod()} + + +class TestNPUReduceProdWithOutDtype_bool(TestNPUReduceProd): + def setUp(self): + self.op_type = "reduce_prod" + self.set_npu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'dim': [0], 'out_dtype': int(core.VarDesc.VarType.BOOL)} + self.outputs = { + 'Out': + self.inputs['X'].prod(axis=tuple(self.attrs['dim'])).astype(np.bool) + } + + +class TestNPUReduceProdWithOutDtype_int16(TestNPUReduceProd): + def setUp(self): + self.op_type = "reduce_prod" + self.set_npu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'dim': [0], 'out_dtype': int(core.VarDesc.VarType.INT16)} + self.outputs = { + 'Out': self.inputs['X'].prod( + axis=tuple(self.attrs['dim'])).astype(np.int16) + } + + +class TestNPUReduceProdWithOutDtype_int32(TestNPUReduceProd): + def setUp(self): + self.op_type = "reduce_prod" + self.set_npu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + 
self.attrs = {'dim': [0], 'out_dtype': int(core.VarDesc.VarType.INT32)} + self.outputs = { + 'Out': self.inputs['X'].prod( + axis=tuple(self.attrs['dim'])).astype(np.int32) + } + + +class TestNPUReduceProdWithOutDtype_int64(TestNPUReduceProd): + def setUp(self): + self.op_type = "reduce_prod" + self.set_npu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'dim': [0], 'out_dtype': int(core.VarDesc.VarType.INT64)} + self.outputs = { + 'Out': self.inputs['X'].prod( + axis=tuple(self.attrs['dim'])).astype(np.int64) + } + + +class TestNPUReduceProdWithOutDtype_fp16(TestNPUReduceProd): + def setUp(self): + self.op_type = "reduce_prod" + self.set_npu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'dim': [0], 'out_dtype': int(core.VarDesc.VarType.FP16)} + self.outputs = { + 'Out': self.inputs['X'].prod( + axis=tuple(self.attrs['dim'])).astype(np.float16) + } + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-3) + + +class TestNPUReduceProdWithOutDtype_fp32(TestNPUReduceProd): + def setUp(self): + self.op_type = "reduce_prod" + self.set_npu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'dim': [0], 'out_dtype': int(core.VarDesc.VarType.FP32)} + self.outputs = { + 'Out': self.inputs['X'].prod( + axis=tuple(self.attrs['dim'])).astype(np.float32) + } + + +class TestNPUReduceProdWithOutDtype_fp64(TestNPUReduceProd): + def setUp(self): + self.op_type = "reduce_prod" + self.set_npu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'dim': [0], 'out_dtype': int(core.VarDesc.VarType.FP64)} + self.outputs = { + 'Out': self.inputs['X'].prod( + axis=tuple(self.attrs['dim'])).astype(np.float64) + } + + +@skip_check_grad_ci(reason="right now not implement grad op") +class TestNPUReduceProdWithOutDtype_fp32_2(TestNPUReduceProd): + def setUp(self): + self.op_type = "reduce_prod" + self.set_npu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'dim': [0], 'out_dtype': int(core.VarDesc.VarType.FP32)} + self.outputs = { + 'Out': self.inputs['X'].prod( + axis=tuple(self.attrs['dim'])).astype(np.float32) + } + + def init_dtype(self): + self.dtype = np.float16 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py new file mode 100644 index 00000000000000..601a351c015f32 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py @@ -0,0 +1,166 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
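+# relu6 clips its input to [0, threshold] (threshold defaults to 6.0), i.e.
+# out = min(max(x, 0), threshold); ref_relu6 below is the numpy reference used by the tests.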
+ +from __future__ import print_function +import paddle.fluid as fluid +import paddle +from op_test import OpTest + +import numpy as np +import unittest +import sys +sys.path.append("..") + +paddle.enable_static() +SEED = 2021 + + +def ref_relu6(x, threshold=6.0): + out = np.copy(x) + out[np.abs(x - threshold) < 0.005] = threshold + 0.02 + out = np.minimum(np.maximum(x, 0), threshold) + return out + + +class TestRelu6(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "relu6" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(-1, 10, [10, 12]).astype(self.dtype) + x[np.abs(x) < 0.005] = 0.02 + out = ref_relu6(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {'threshold': 6.0} + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + if self.dtype == np.float16: + return + self.check_grad_with_place(self.place, ['X'], 'Out') + + def init_dtype(self): + self.dtype = np.float32 + + +class TestRelu6Float16(TestRelu6): + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + def set_attrs(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestReluNeg(TestRelu6): + def setUp(self): + self.set_npu() + self.op_type = "relu6" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(-10, -1, [10, 12]).astype(self.dtype) + x[np.abs(x) < 0.005] = 0.02 + out = ref_relu6(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {'threshold': 6.0} + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestRelu6Net(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + sum = paddle.add(a, b) + z = paddle.nn.functional.relu6(sum) + + fc_1 = fluid.layers.fc(input=z, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_npu(self): + 
cpu_pred, cpu_loss = self._test(False) + npu_pred, npu_loss = self._test(True) + + self.assertTrue(np.allclose(npu_pred, cpu_pred)) + self.assertTrue(np.allclose(npu_loss, cpu_loss)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_sequence_mask_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sequence_mask_npu.py new file mode 100644 index 00000000000000..21440de9fddd13 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_sequence_mask_npu.py @@ -0,0 +1,182 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.framework import convert_np_dtype_to_dtype_, Program, program_guard + +paddle.enable_static() + + +class SequenceMaskTestBase(OpTest): + def set_npu(self): + self.__class__.use_npu = True + + def initDefaultParameters(self): + self.op_type = 'sequence_mask' + self.maxlen = 10 + self.mask_dtype = 'int64' + self.x = [[0, 3, 4], [5, 7, 9]] + + def initParameters(self): + pass + + def setUp(self): + self.set_npu() + self.initDefaultParameters() + self.initParameters() + if not isinstance(self.x, np.ndarray): + self.x = np.array(self.x) + + self.inputs = {'X': self.x} + self.outputs = {'Y': self.calc_ground_truth_mask()} + self.attrs = { + 'maxlen': self.maxlen, + 'out_dtype': convert_np_dtype_to_dtype_(self.mask_dtype) + } + + def calc_ground_truth_mask(self): + maxlen = np.max(self.x) if self.maxlen < 0 else self.maxlen + shape = self.x.shape + (maxlen, ) + index_broadcast = np.broadcast_to( + np.reshape( + range(maxlen), newshape=[1] * self.x.ndim + [-1]), + shape=shape) + x_broadcast = np.broadcast_to( + np.reshape( + self.x, newshape=self.x.shape + (-1, )), shape=shape) + return (index_broadcast < x_broadcast).astype(self.mask_dtype) + + def test_check_output(self): + self.check_output_with_place(paddle.NPUPlace(0)) + + +class SequenceMaskTest1(SequenceMaskTestBase): + def initParameters(self): + self.mask_dtype = 'bool' + + +class SequenceMaskTest2(SequenceMaskTestBase): + def initParameters(self): + self.mask_dtype = 'uint8' + + +class SequenceMaskTest3(SequenceMaskTestBase): + def initParameters(self): + self.mask_dtype = 'int32' + + +class SequenceMaskTest4(SequenceMaskTestBase): + def initParameters(self): + self.mask_dtype = 'float32' + + +class SequenceMaskTest5(SequenceMaskTestBase): + def initParameters(self): + self.mask_dtype = 'float64' + + +class SequenceMaskTest6(SequenceMaskTestBase): + def initParameters(self): + self.maxlen = -1 + + +class SequenceMaskTestBase_tensor_attr(OpTest): + def set_npu(self): + self.__class__.use_npu = True + + def initDefaultParameters(self): + self.op_type = 'sequence_mask' + self.maxlen = 10 + self.maxlen_tensor = np.ones((1), 'int32') * 10 + self.mask_dtype = 'int64' + 
self.x = [[0, 3, 4], [5, 7, 9]] + + def initParameters(self): + pass + + def setUp(self): + self.set_npu() + self.initDefaultParameters() + self.initParameters() + if not isinstance(self.x, np.ndarray): + self.x = np.array(self.x) + + self.inputs = {'X': self.x, 'MaxLenTensor': self.maxlen_tensor} + self.outputs = {'Y': self.calc_ground_truth_mask()} + self.attrs = {'out_dtype': convert_np_dtype_to_dtype_(self.mask_dtype)} + + def calc_ground_truth_mask(self): + maxlen = np.max(self.x) if self.maxlen < 0 else self.maxlen + shape = self.x.shape + (maxlen, ) + index_broadcast = np.broadcast_to( + np.reshape( + range(maxlen), newshape=[1] * self.x.ndim + [-1]), + shape=shape) + x_broadcast = np.broadcast_to( + np.reshape( + self.x, newshape=self.x.shape + (-1, )), shape=shape) + return (index_broadcast < x_broadcast).astype(self.mask_dtype) + + def test_check_output(self): + self.check_output() + + +class SequenceMaskTest1_tensor_attr(SequenceMaskTestBase_tensor_attr): + def initParameters(self): + self.mask_dtype = 'bool' + + +class SequenceMaskTest2_tensor_attr(SequenceMaskTestBase_tensor_attr): + def initParameters(self): + self.mask_dtype = 'uint8' + + +class SequenceMaskTest3_tensor_attr(SequenceMaskTestBase_tensor_attr): + def initParameters(self): + self.mask_dtype = 'int32' + + +class SequenceMaskTest4_tensor_attr(SequenceMaskTestBase_tensor_attr): + def initParameters(self): + self.mask_dtype = 'float32' + + +class SequenceMaskTest5_tensor_attr(SequenceMaskTestBase_tensor_attr): + def initParameters(self): + self.mask_dtype = 'float64' + + +class TestSequenceMaskOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + input_data = np.random.uniform(1, 5, [4]).astype("float32") + + def test_Variable(): + # the input must be Variable + fluid.layers.sequence_mask(input_data, maxlen=4) + + self.assertRaises(TypeError, test_Variable) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_squared_l2_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_squared_l2_norm_op_npu.py new file mode 100644 index 00000000000000..d3ee8df1cd106f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_squared_l2_norm_op_npu.py @@ -0,0 +1,57 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
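+# squared_l2_norm reduces X to a single value equal to the sum of its squared elements;
+# the test below uses np.square(LA.norm(X)) as the numpy reference.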
+ +from __future__ import print_function + +import numpy as np +import unittest +from numpy import linalg as LA +import sys +sys.path.append("..") +from op_test import OpTest +import paddle + +paddle.enable_static() + + +class TestL2LossOp(OpTest): + """Test npu squared_l2_norm + """ + + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "squared_l2_norm" + self.max_relative_error = 0.05 + + X = np.random.uniform(-1, 1, (13, 19)).astype("float32") + X[np.abs(X) < self.max_relative_error] = 0.1 + self.inputs = {'X': X} + self.outputs = {'Out': np.square(LA.norm(X))} + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(place=self.place) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X'], + 'Out', + max_relative_error=self.max_relative_error) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_tile_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_tile_op_npu.py new file mode 100755 index 00000000000000..0da80189f7d406 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_tile_op_npu.py @@ -0,0 +1,245 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
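+# tile repeats X along each dimension according to repeat_times, which may be a plain list,
+# a list containing tensors, or a single tensor; every case below checks against np.tile.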
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +from paddle.fluid import core + +paddle.enable_static() +np.random.seed(10) + + +#Situation 1: repeat_times is a list (without tensor) +class TestTileOpRank1(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "tile" + self.init_data() + + self.inputs = {'X': np.random.random(self.ori_shape).astype("float32")} + self.attrs = {'repeat_times': self.repeat_times} + output = np.tile(self.inputs['X'], self.repeat_times) + self.outputs = {'Out': output} + + def set_npu(self): + self.__class__.use_npu = True + + def init_data(self): + self.ori_shape = [100] + self.repeat_times = [2] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + pass + + +#with dimension expanding +class TestTileOpRank2Expanding(TestTileOpRank1): + def init_data(self): + self.ori_shape = [120] + self.repeat_times = [2, 2] + + +class TestTileOpRank2(TestTileOpRank1): + def init_data(self): + self.ori_shape = [12, 14] + self.repeat_times = [2, 3] + + +class TestTileOpRank3_Corner(TestTileOpRank1): + def init_data(self): + self.ori_shape = (2, 10, 5) + self.repeat_times = (1, 1, 1) + + +class TestTileOpRank3_Corner2(TestTileOpRank1): + def init_data(self): + self.ori_shape = (2, 10, 5) + self.repeat_times = (2, 2) + + +class TestTileOpRank3(TestTileOpRank1): + def init_data(self): + self.ori_shape = (2, 4, 15) + self.repeat_times = (2, 1, 4) + + +class TestTileOpRank4(TestTileOpRank1): + def init_data(self): + self.ori_shape = (2, 4, 5, 7) + self.repeat_times = (3, 2, 1, 2) + + +# Situation 2: repeat_times is a list (with tensor) +class TestTileOpRank1_tensor_attr(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "tile" + self.init_data() + repeat_times_tensor = [] + for index, ele in enumerate(self.repeat_times): + repeat_times_tensor.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs = { + 'X': np.random.random(self.ori_shape).astype("float32"), + 'repeat_times_tensor': repeat_times_tensor, + } + self.attrs = {"repeat_times": self.infer_repeat_times} + output = np.tile(self.inputs['X'], self.repeat_times) + self.outputs = {'Out': output} + + def set_npu(self): + self.__class__.use_npu = True + + def init_data(self): + self.ori_shape = [100] + self.repeat_times = [2] + self.infer_repeat_times = [-1] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + pass + + +class TestTileOpRank2_Corner_tensor_attr(TestTileOpRank1_tensor_attr): + def init_data(self): + self.ori_shape = [12, 14] + self.repeat_times = [1, 1] + self.infer_repeat_times = [1, -1] + + +class TestTileOpRank2_attr_tensor(TestTileOpRank1_tensor_attr): + def init_data(self): + self.ori_shape = [12, 14] + self.repeat_times = [2, 3] + self.infer_repeat_times = [-1, 3] + + +# Situation 3: repeat_times is a tensor +class TestTileOpRank1_tensor(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "tile" + self.init_data() + + self.inputs = { + 'X': np.random.random(self.ori_shape).astype("float32"), + 'RepeatTimes': np.array(self.repeat_times).astype("int32"), + } + self.attrs = {} + output = np.tile(self.inputs['X'], self.repeat_times) + self.outputs = 
{'Out': output} + + def set_npu(self): + self.__class__.use_npu = True + + def init_data(self): + self.ori_shape = [100] + self.repeat_times = [2] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + pass + + +class TestTileOpRank2_tensor(TestTileOpRank1_tensor): + def init_data(self): + self.ori_shape = [12, 14] + self.repeat_times = [2, 3] + + +# Situation 4: input x is Integer +class TestTileOpInteger(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "tile" + self.inputs = { + 'X': np.random.randint( + 10, size=(4, 4, 5)).astype("int32") + } + self.attrs = {'repeat_times': [2, 1, 4]} + output = np.tile(self.inputs['X'], (2, 1, 4)) + self.outputs = {'Out': output} + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +# Situation 5: input x is Integer +class TestTileOpInt64_t(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "tile" + self.inputs = { + 'X': np.random.randint( + 10, size=(2, 4, 5)).astype("int32") + } + self.attrs = {'repeat_times': [2, 1, 4]} + output = np.tile(self.inputs['X'], (2, 1, 4)) + self.outputs = {'Out': output} + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +# Test python API +class TestTileAPI(unittest.TestCase): + def test_api(self): + with fluid.dygraph.guard(paddle.NPUPlace(0)): + np_x = np.random.random([12, 14]).astype("float32") + x = paddle.to_tensor(np_x) + + positive_2 = np.array([2]).astype("int32") + positive_2 = paddle.to_tensor(positive_2) + + repeat_times = np.array([2, 3]).astype("int32") + repeat_times = paddle.to_tensor(repeat_times) + + out_1 = paddle.tile(x, repeat_times=[2, 3]) + out_2 = paddle.tile(x, repeat_times=[positive_2, 3]) + out_3 = paddle.tile(x, repeat_times=repeat_times) + + assert np.array_equal(out_1.numpy(), np.tile(np_x, (2, 3))) + assert np.array_equal(out_2.numpy(), np.tile(np_x, (2, 3))) + assert np.array_equal(out_3.numpy(), np.tile(np_x, (2, 3))) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_top_k_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_top_k_v2_op_npu.py new file mode 100755 index 00000000000000..a8242be855c80a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_top_k_v2_op_npu.py @@ -0,0 +1,343 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
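+# top_k_v2 returns the k largest (or smallest, when largest=False) entries and their indices
+# along `axis`; numpy_topk below builds the reference with np.argsort/np.sort and takes the
+# first k entries.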
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid.core as core + + +def numpy_topk(x, k=1, axis=-1, largest=True): + if axis < 0: + axis = len(x.shape) + axis + if largest: + indices = np.argsort(-x, axis=axis) + else: + indices = np.argsort(x, axis=axis) + if largest: + value = -np.sort(-x, axis=axis) + else: + value = np.sort(x, axis=axis) + indices = indices.take(indices=range(0, k), axis=axis) + value = value.take(indices=range(0, k), axis=axis) + return value, indices + + +class TestTopkV2NPUOp(OpTest): + def setUp(self): + paddle.enable_static() + self.op_type = "top_k_v2" + + self.set_npu() + self.set_dtype() + self.set_input_data() + self.set_attrs() + output, indices = numpy_topk( + self.input_data, axis=self.axis, k=self.k, largest=self.largest) + + self.inputs = {'X': self.input_data} + self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} + self.outputs = {'Out': output, 'Indices': indices} + + def set_dtype(self): + self.dtype = np.int32 + + def set_attrs(self): + self.k = 3 + self.axis = 1 + self.largest = True + + def set_input_data(self): + self.input_data = np.random.choice( + 10000, size=(10, 20), replace=False).astype(self.dtype) + + def test_check_output(self): + self.__class__.no_need_check_grad = True + if self.dtype == np.float32: + self.check_output_with_place(self.place, atol=1e-3) + else: + self.check_output_with_place(self.place) + + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + +class TestTopkV2OpFloat16(TestTopkV2NPUOp): + def set_attrs(self): + self.k = 3 + self.axis = 1 + self.largest = True + + def set_dtype(self): + self.dtype = np.float32 + + def set_input_data(self): + self.input_data = np.random.rand(3, 4).astype(self.dtype) + + +class TestTopkV2OP1Int32(TestTopkV2NPUOp): + def set_attrs(self): + self.k = 3 + self.axis = 0 + self.largest = False + + +class TestTopkV2OP2Int32(TestTopkV2NPUOp): + def set_attrs(self): + self.k = 4 + self.axis = 0 + self.largest = False + + +class TestTopkV2OP3Int32(TestTopkV2NPUOp): + def set_attrs(self): + self.k = 6 + self.axis = 1 + self.largest = True + + +class TestTopkV2OP4Int32(TestTopkV2NPUOp): + def set_attrs(self): + self.k = 3 + self.axis = 1 + self.largest = True + + +class TestTopkV2Op1Int64(TestTopkV2OP1Int32): + def set_dtype(self): + self.dtype = np.int64 + + +class TestTopkV2Op2Int64(TestTopkV2OP2Int32): + def set_dtype(self): + self.dtype = np.int64 + + +class TestTopkV2Op3Int64(TestTopkV2OP3Int32): + def set_dtype(self): + self.dtype = np.int64 + + +class TestTopkV2Op4Int64(TestTopkV2OP4Int32): + def set_dtype(self): + self.dtype = np.int64 + + +class TestTopkV2Op1Float32(TestTopkV2OP1Int32): + def set_dtype(self): + self.dtype = np.float32 + + def set_input_data(self): + self.input_data = np.random.rand(10, 20).astype(self.dtype) + + +class TestTopkV2Op2Float32(TestTopkV2OP2Int32): + def set_dtype(self): + self.dtype = np.float32 + + def set_input_data(self): + self.input_data = np.random.rand(10, 20).astype(self.dtype) + + +class TestTopkV2Op3Float32(TestTopkV2OP3Int32): + def set_dtype(self): + self.dtype = np.float32 + + def set_input_data(self): + self.input_data = np.random.rand(10, 20).astype(self.dtype) + + +class TestTopkV2Op4Float32(TestTopkV2OP4Int32): + def set_dtype(self): + self.dtype = np.float32 + + def set_input_data(self): + self.input_data = np.random.rand(10, 20).astype(self.dtype) + + +class 
TestTopkV2Op1Float64(TestTopkV2OP1Int32): + def set_dtype(self): + self.dtype = np.float64 + + def set_input_data(self): + self.input_data = np.random.rand(10, 20).astype(self.dtype) + + +class TestTopkV2Op2Float64(TestTopkV2OP2Int32): + def set_dtype(self): + self.dtype = np.float64 + + def set_input_data(self): + self.input_data = np.random.rand(10, 20).astype(self.dtype) + + +class TestTopkV2Op3Float64(TestTopkV2OP3Int32): + def set_dtype(self): + self.dtype = np.float64 + + def set_input_data(self): + self.input_data = np.random.rand(10, 20).astype(self.dtype) + + +class TestTopkV2Op4Float64(TestTopkV2OP4Int32): + def set_dtype(self): + self.dtype = np.float64 + + def set_input_data(self): + self.input_data = np.random.rand(10, 20).astype(self.dtype) + + +class TestTopKAPI(unittest.TestCase): + def setUp(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + np.random.seed(123) + self.input_data = np.random.rand(6, 7, 8) + self.large_input_data = np.random.rand(2, 1030) + + def run_dygraph(self, place): + paddle.disable_static(place) + input_tensor = paddle.to_tensor(self.input_data) + large_input_tensor = paddle.to_tensor(self.large_input_data) + # test case for basic test case 1 + paddle_result = paddle.topk(input_tensor, k=2) + numpy_result = numpy_topk(self.input_data, k=2) + self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + + # test case for basic test case 2 with axis + paddle_result = paddle.topk(input_tensor, k=2, axis=1) + numpy_result = numpy_topk(self.input_data, k=2, axis=1) + self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + # test case for basic test case 3 with tensor K + k_tensor = paddle.to_tensor(np.array([2])) + paddle_result = paddle.topk(input_tensor, k=k_tensor, axis=1) + numpy_result = numpy_topk(self.input_data, k=2, axis=1) + self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + + # test case for basic test case 4 with tensor largest + k_tensor = paddle.to_tensor(np.array([2])) + paddle_result = paddle.topk(input_tensor, k=2, axis=1, largest=False) + numpy_result = numpy_topk(self.input_data, k=2, axis=1, largest=False) + self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + + # test case for basic test case 5 with axis -1 + k_tensor = paddle.to_tensor(np.array([2])) + paddle_result = paddle.topk(input_tensor, k=2, axis=-1, largest=False) + numpy_result = numpy_topk(self.input_data, k=2, axis=-1, largest=False) + self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + + # test case for basic test case 6 for the partial sort + paddle_result = paddle.topk(large_input_tensor, k=1, axis=-1) + numpy_result = numpy_topk(self.large_input_data, k=1, axis=-1) + self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + # test case for basic test case 7 for the unsorted + paddle_result = paddle.topk(input_tensor, k=2, axis=1, sorted=False) + sort_paddle = numpy_topk( + np.array(paddle_result[0].numpy()), axis=1, k=2) + numpy_result = numpy_topk(self.input_data, k=2, axis=1) + 
self.assertTrue(np.allclose(sort_paddle[0], numpy_result[0])) + + def run_static(self, place): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + input_tensor = paddle.static.data( + name="x", shape=[6, 7, 8], dtype="float64") + large_input_tensor = paddle.static.data( + name="large_x", shape=[2, 1030], dtype="float64") + k_tensor = paddle.static.data(name="k", shape=[1], dtype="int32") + result1 = paddle.topk(input_tensor, k=2) + result2 = paddle.topk(input_tensor, k=2, axis=-1) + result3 = paddle.topk(input_tensor, k=k_tensor, axis=1) + self.assertEqual(result3[0].shape, (6, -1, 8)) + self.assertEqual(result3[1].shape, (6, -1, 8)) + result4 = paddle.topk(input_tensor, k=2, axis=1, largest=False) + result5 = paddle.topk(input_tensor, k=2, axis=-1, largest=False) + result6 = paddle.topk(large_input_tensor, k=1, axis=-1) + result7 = paddle.topk(input_tensor, k=2, axis=1, sorted=False) + exe = paddle.static.Executor(place) + input_data = np.random.rand(10, 20).astype("float64") + large_input_data = np.random.rand(2, 100).astype("float64") + paddle_result = exe.run( + feed={ + "x": self.input_data, + "large_x": self.large_input_data, + "k": np.array([2]).astype("int32") + }, + fetch_list=[ + result1[0], result1[1], result2[0], result2[1], result3[0], + result3[1], result4[0], result4[1], result5[0], result5[1], + result6[0], result6[1], result7[0], result7[1] + ]) + numpy_result = numpy_topk(self.input_data, k=2) + self.assertTrue(np.allclose(paddle_result[0], numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[1], numpy_result[1])) + + numpy_result = numpy_topk(self.input_data, k=2, axis=-1) + self.assertTrue(np.allclose(paddle_result[2], numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[3], numpy_result[1])) + + numpy_result = numpy_topk(self.input_data, k=2, axis=1) + self.assertTrue(np.allclose(paddle_result[4], numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[5], numpy_result[1])) + + numpy_result = numpy_topk( + self.input_data, k=2, axis=1, largest=False) + self.assertTrue(np.allclose(paddle_result[6], numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[7], numpy_result[1])) + + numpy_result = numpy_topk( + self.input_data, k=2, axis=-1, largest=False) + self.assertTrue(np.allclose(paddle_result[8], numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[9], numpy_result[1])) + + numpy_result = numpy_topk(self.large_input_data, k=1, axis=-1) + self.assertTrue(np.allclose(paddle_result[10], numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[11], numpy_result[1])) + sort_paddle = numpy_topk(paddle_result[12], axis=1, k=2) + numpy_result = numpy_topk(self.input_data, k=2, axis=1) + self.assertTrue(np.allclose(sort_paddle[0], numpy_result[0])) + + def test_cases(self): + places = [core.NPUPlace(0)] + for place in places: + self.run_dygraph(place) + self.run_static(place) + + def test_errors(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + paddle.disable_static() + x = paddle.to_tensor([1, 2, 3]) + with self.assertRaises(BaseException): + paddle.topk(x, k=-1) + + with self.assertRaises(BaseException): + paddle.topk(x, k=0) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py index ab1127afa58dd9..cade4b850cd1d6 100644 --- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py +++ 
b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py @@ -60,9 +60,16 @@ def test_with_zero_state(self): y2, h2 = rnn2(paddle.to_tensor(x)) np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + def test_errors(self): + def test_zero_hidden_size(): + cell = paddle.nn.SimpleRNNCell(-1, 0) + + self.assertRaises(ValueError, test_zero_hidden_size) + def runTest(self): self.test_with_initial_state() self.test_with_zero_state() + self.test_errors() class TestGRUCell(unittest.TestCase): @@ -103,9 +110,16 @@ def test_with_zero_state(self): y2, h2 = rnn2(paddle.to_tensor(x)) np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + def test_errors(self): + def test_zero_hidden_size(): + cell = paddle.nn.GRUCell(-1, 0) + + self.assertRaises(ValueError, test_zero_hidden_size) + def runTest(self): self.test_with_initial_state() self.test_with_zero_state() + self.test_errors() class TestLSTMCell(unittest.TestCase): @@ -150,9 +164,16 @@ def test_with_zero_state(self): np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5) + def test_errors(self): + def test_zero_hidden_size(): + cell = paddle.nn.LSTMCell(-1, 0) + + self.assertRaises(ValueError, test_zero_hidden_size) + def runTest(self): self.test_with_initial_state() self.test_with_zero_state() + self.test_errors() def load_tests(loader, tests, pattern): diff --git a/python/paddle/fluid/tests/unittests/test_backward.py b/python/paddle/fluid/tests/unittests/test_backward.py index 7ca0832b718fd0..e0d6a606e2569c 100644 --- a/python/paddle/fluid/tests/unittests/test_backward.py +++ b/python/paddle/fluid/tests/unittests/test_backward.py @@ -16,6 +16,9 @@ import unittest import paddle.fluid as fluid +import paddle.static as static +import paddle + import numpy as np @@ -327,6 +330,35 @@ def callback(block, context): loss=self.avg_loss, callbacks=callback) +class TestGradientsWithOptimizer(unittest.TestCase): + def _check_grad_op_name(self, forward_list, optimiezed_list): + backward_list = [op + "_grad" for op in reversed(forward_list)] + idx = optimiezed_list.index(backward_list[0], len(backward_list)) + + self.assertListEqual(backward_list, + optimiezed_list[idx:idx + len(backward_list)]) + + def test_gradient_with_optimizer(self): + main = fluid.Program() + startup = fluid.Program() + + with fluid.program_guard(main, startup): + img = static.data(name='image', shape=[None, 784]) + pred = static.nn.fc(x=img, size=10, activation='relu') + loss = paddle.mean(pred) + opt = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9) + + forward_list = [o.type for o in main.current_block().ops] + optimize_ops, pram_grads = paddle.autograd.backward_mode.gradients_with_optimizer( + main, opt) + + optimized_list = [o.type for o in main.current_block().ops] + + self.assertGreater(len(optimized_list), len(forward_list)) + self.assertIn(opt.type, optimized_list) + self._check_grad_op_name(forward_list, optimized_list) + + # TODO(Aurelius84): add conditional network test class ConditionalNet(BackwardNet): def __init__(self): @@ -334,4 +366,5 @@ def __init__(self): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py b/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py index bc9ff3697717d1..f87b732d1b2cc0 100644 --- a/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch_elastic.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch_elastic.sh index 105ed1356ede3a..8b618195f55ea0 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_launch_elastic.sh +++ b/python/paddle/fluid/tests/unittests/test_fleet_launch_elastic.sh @@ -17,6 +17,15 @@ echo "begin test elastic" unset GREP_OPTIONS rm -rf log +pids=`ps -ef | grep "python -m paddle.distributed.launch elastic_demo.[py]" | awk '{print $2}'` +if [ -n "$pids" ]; then + echo $pids | xargs kill -9 +fi +pids=`ps -ef | grep "/usr/bin/python -u elastic_demo.[py]" | awk '{print $2}'` +if [ -n "$pids" ]; then + echo $pids | xargs kill -9 +fi + python -m pip install --no-cache-dir etcd3 -i https://mirror.baidu.com/pypi/simple # common env @@ -115,6 +124,8 @@ do fi done +> $lw0 + # rerun node 1 export NVIDIA_VISIBLE_DEVICES=1 export CUDA_VISIBLE_DEVICES=1 @@ -144,5 +155,54 @@ done check_env +> log_0.log + +for i in {1..10} +do + ## kill with -9 + kill -9 $p0 + sleep 1 + if [ `ps -p $p0 | wc -l` == "2" ]; then + echo "force stop node 0 error" + exit -1 + else + echo "force stop node 0 ok" + break + fi +done + +> $lw0 + +# rerun node 0 +export NVIDIA_VISIBLE_DEVICES=0 +export CUDA_VISIBLE_DEVICES=0 +export DISTRIBUTED_TRAINER_ENDPOINTS=10.10.10.10:8001,10.10.10.3:8001 +export PADDLE_TRAINERS=10.10.10.10,10.10.10.3 +export TRAINER_PORTS_NUM=1 +export POD_IP=10.10.10.10 +export PADDLE_TRAINER_ID=0 +export PADDLE_TRAINERS_NUM=2 + +python -m paddle.distributed.launch elastic_demo.py &> log_0.log & +p0=$! 
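# Poll for up to 10 seconds: node 1's log (log_1.log) should contain the
# "INFO:ELASTIC:ready with hosts" line with node 0's IP (10.10.10.10) once the
# relaunched node 0 has rejoined the elastic job; if the message never shows up,
# report "rerun node 0 error" and exit non-zero.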
+ +for i in {1..10} +do + if grep "INFO:ELASTIC:ready with hosts" log_1.log | grep -q '10.10.10.10'; then + echo "rerun node 0 ok" + break + else + sleep 1 + fi + if [ $i -eq 10 ]; then + echo "rerun node 0 error" + exit -1 + fi +done + +check_env + +echo "All check done" + sleep 3 kill $p0 $p1 diff --git a/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py index a9c37d78537eec..3f8d994ad19e44 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py @@ -14,6 +14,10 @@ import unittest import paddle +import paddle.fluid as fluid +import paddle.static as static +import paddle.distributed.fleet as fleet +import paddle.distributed.fleet.base.role_maker as role_maker import os paddle.enable_static() @@ -25,26 +29,34 @@ def setUp(self): os.environ[ "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002" - def test_pipeline_optimizer(self): - import paddle.distributed.fleet as fleet - import paddle.distributed.fleet.base.role_maker as role_maker - role = role_maker.PaddleCloudRoleMaker(is_collective=True) - fleet.init(role) - with paddle.fluid.device_guard("gpu:0"): + def net(self): + with static.device_guard("gpu:0"): input_x = paddle.fluid.layers.data( name="x", shape=[32], dtype='float32') input_y = paddle.fluid.layers.data( name="y", shape=[1], dtype='int64') + input_z = paddle.fluid.layers.data( + name="z", shape=[1], dtype="float32") + with static.device_guard("gpu:all"): + input_z = input_z * 1.0 + input_z.stop_gradient = True fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_1 = fc_1 * input_z - with paddle.fluid.device_guard("gpu:1"): + with static.device_guard("gpu:1"): fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + fc_2 = fc_2 * input_z prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') cost = paddle.fluid.layers.cross_entropy( input=prediction, label=input_y) avg_cost = paddle.fluid.layers.mean(x=cost) + return avg_cost + + def test_pipeline_optimizer(self): + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.pipeline = True @@ -53,9 +65,43 @@ def test_pipeline_optimizer(self): 'accumulate_steps': 2 } - optimizer = paddle.fluid.optimizer.Adam(0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - optimizer.minimize(avg_cost) + train_prog, startup_prog = static.Program(), static.Program() + with static.program_guard(train_prog, startup_prog): + with fluid.unique_name.guard(): + avg_cost = self.net() + + optimizer = paddle.fluid.optimizer.Adam(0.01) + optimizer = fleet.distributed_optimizer( + optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + def test_pipeline_amp_optimizer(self): + """ test pipeline& with device:all """ + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.amp = True + strategy.pipeline = True + strategy.pipeline_configs = { + 'micro_batch_size': 1, + 'accumulate_steps': 2 + } + + train_prog, startup_prog = static.Program(), static.Program() + with static.program_guard(train_prog, startup_prog): + with fluid.unique_name.guard(): + avg_cost = self.net() + + optimizer = paddle.fluid.optimizer.Adam(0.01) + optimizer = fleet.distributed_optimizer( + optimizer, 
strategy=strategy) + optimizer.minimize(avg_cost) + + ops = train_prog._pipeline_opt['section_program'].global_block().ops + ops = [op.type for op in ops] + self.assertEqual(ops.count('send_v2'), 1) + self.assertEqual(ops.count('recv_v2'), 1) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index 1387827736560e..b7cf9dfaec5760 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -264,8 +264,8 @@ def test_sharding_gradient_clip(self): 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', - 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', 'square', - 'reduce_sum', 'square', 'reduce_sum', 'square', 'reduce_sum', 'sum', + 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', + 'squared_l2_norm', 'squared_l2_norm', 'squared_l2_norm', 'sum', 'c_allreduce_sum', 'sqrt', 'fill_constant', 'elementwise_max', 'elementwise_div', 'elementwise_mul', 'elementwise_mul', 'elementwise_mul', 'momentum', 'momentum', 'momentum' @@ -366,6 +366,8 @@ def test_sharding_hybrid_dp(self): "gradient_merge_acc_step": 1, "mp_degree": 1 } + + strategy.fuse_all_reduce_ops = False self.optimizer(avg_cost, strategy, train_prog, startup_prog) startup_prog_ops = startup_prog.global_block().ops main_prog_ops = train_prog.global_block().ops diff --git a/python/paddle/fluid/tests/unittests/test_gast_with_compatibility.py b/python/paddle/fluid/tests/unittests/test_gast_with_compatibility.py index 17ba6869534fe7..8404c563274b1e 100644 --- a/python/paddle/fluid/tests/unittests/test_gast_with_compatibility.py +++ b/python/paddle/fluid/tests/unittests/test_gast_with_compatibility.py @@ -15,7 +15,7 @@ from __future__ import print_function import ast -import gast +from paddle.utils import gast import sys import textwrap import unittest diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py index 14f5d4a41a1fed..9b6dbc00f7c565 100644 --- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py +++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py @@ -22,6 +22,8 @@ import six from fake_reader import fake_imdb_reader +paddle.enable_static() + def bow_net(data, label, @@ -149,7 +151,7 @@ def clip_gradient(self, params_grads): def check_clip_result(self, out, out_clip): global_norm = 0 for v in out: - global_norm += np.sum(np.power(v, 2)) + global_norm += np.sum(np.square(v)) global_norm = np.sqrt(global_norm) scale = self.clip_norm / np.maximum(self.clip_norm, global_norm) res = [] @@ -160,7 +162,8 @@ def check_clip_result(self, out, out_clip): self.assertTrue( np.allclose( a=u, b=v, rtol=1e-5, atol=1e-8), - "gradient clip by global norm has wrong results!") + "gradient clip by global norm has wrong results!, \nu={}\nv={}\ndiff={}". + format(u, v, u - v)) # test whether the ouput is right when use 'set_gradient_clip' def test_old_gradient_clip(self): @@ -210,12 +213,16 @@ def test_none_grad(self): params_grads = [(x, None), (x, y), (y, x)] params_grads = clip(params_grads) self.assertTrue( - len(clip(params_grads)) == 2, + len(params_grads) == 2, "ClipByGlobalNorm: when grad is None, it shouldn't be returned by gradient clip!" 
) - self.assertTrue( - params_grads[0][1].name != 'y', - "ClipByGlobalNorm: param_grad (x, y) should be clipped!") + + ops = [op.type for op in x.block.ops] + self.assertListEqual(ops, [ + 'squared_l2_norm', 'squared_l2_norm', 'sum', 'sqrt', + 'fill_constant', 'elementwise_max', 'elementwise_div', + 'elementwise_mul', 'elementwise_mul' + ]) # raise typeError def test_tpyeError(self): diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index 590c3e061f26ee..965ae65614a40a 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -575,6 +575,20 @@ def test_errors(self): weight, path_code=path_code_int32) + # test paddle.nn.HSigmoidLoss + paddle.disable_static(self.place) + x_arr = np.array([], dtype=np.float32) + x = paddle.to_tensor(np.reshape(x_arr, (100000, 0))) + label = paddle.to_tensor(0, dtype='int64') + self.assertRaises(ValueError, paddle.nn.HSigmoidLoss, x, label) + + # test paddle.nn.functional.hsigmoid_loss + x = paddle.to_tensor(np.reshape(x_arr, (10, 0)), dtype='float32') + label = paddle.to_tensor([], dtype='int64') + weight = paddle.to_tensor([], dtype='float32') + self.assertRaises(ValueError, F.hsigmoid_loss, x, label, 0, weight) + paddle.enable_static() + # test paddle.fluid.layers.hsigmoid with program_guard(Program()): label = fluid.data('label', [4, 1], 'int64') diff --git a/python/paddle/fluid/tests/unittests/test_imperative_thread_local_has_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_thread_local_has_grad.py new file mode 100644 index 00000000000000..d81849725d75aa --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_thread_local_has_grad.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
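The new test below exercises the thread-locality of gradient recording: entering paddle.no_grad() in one thread must not switch off gradient tracking for code running concurrently in another thread. A small self-contained sketch of that property (layer sizes and worker names here are illustrative, not taken from the patch):

import threading
import numpy as np
import paddle

def guarded_worker():
    # gradient tracking is disabled only inside this thread's no_grad scope
    with paddle.no_grad():
        pass

def tracking_worker(flags):
    # this thread never enters no_grad, so the layer output should still
    # participate in autograd (stop_gradient == False)
    x = paddle.to_tensor(np.ones([4, 8], dtype='float32'))
    y = paddle.nn.Linear(8, 2)(x)
    flags.append(y.stop_gradient)

flags = []
threads = [threading.Thread(target=guarded_worker),
           threading.Thread(target=tracking_worker, args=(flags,))]
for t in threads:
    t.start()
for t in threads:
    t.join()
assert flags == [False]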
+ +import unittest +import paddle +import time +import paddle.nn as nn +import numpy as np +import threading + + +class SimpleNet(nn.Layer): + def __init__(self, in_dim, out_dim): + super(SimpleNet, self).__init__() + self.fc = nn.Linear(in_dim, out_dim) + + def forward(self, x): + return self.fc(x) + + +class TestCases(unittest.TestCase): + @paddle.no_grad() + def thread_1_main(self): + time.sleep(8) + + def thread_2_main(self): + in_dim = 10 + out_dim = 3 + net = SimpleNet(in_dim, out_dim) + for _ in range(1000): + x = paddle.to_tensor(np.random.rand(32, in_dim).astype('float32')) + self.assertTrue(x.stop_gradient) + x = net(x) + self.assertFalse(x.stop_gradient) + + def test_main(self): + threads = [] + for _ in range(10): + threads.append(threading.Thread(target=self.thread_1_main)) + threads.append(threading.Thread(target=self.thread_2_main)) + for t in threads: + t.start() + for t in threads: + t.join() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fix_op_run_order.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fix_op_run_order.py index f48cfbd50eba35..24aa080e68c280 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fix_op_run_order.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fix_op_run_order.py @@ -16,7 +16,7 @@ import paddle.fluid as fluid import unittest import numpy as np -from paddle.vision.models import resnet50 +from paddle.vision.models import resnet18 from paddle.nn import CrossEntropyLoss @@ -33,7 +33,7 @@ def get_place(self): ) else paddle.CPUPlace() def get_feed(self): - batch_size = 32 + batch_size = 4 image = np.random.random([batch_size, 3, 224, 224]).astype('float32') label = np.random.randint(0, 1000, [batch_size, 1]).astype('int64') return {"image": image, "label": label} @@ -47,7 +47,7 @@ def create_model(self, fix_op_run_order): name="image", shape=[None, 3, 224, 224], dtype="float32") label = paddle.static.data( name="label", shape=[None, 1], dtype="int64") - model = resnet50() + model = resnet18() pred = model(image) loss_fn = CrossEntropyLoss() loss = loss_fn(pred, label) diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py index 2dd5bcb8113648..047366145584e5 100644 --- a/python/paddle/fluid/tests/unittests/test_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py @@ -748,37 +748,6 @@ def test_errors(self): self.assertRaises(TypeError, fluid.layers.reduce_sum, x2) -class API_TestSumOpError(unittest.TestCase): - def test_errors(self): - def test_dtype1(): - with fluid.program_guard(fluid.Program(), fluid.Program()): - data = fluid.data(name="data", shape=[10], dtype="float64") - paddle.sum(data, dtype="float32") - - self.assertRaises(ValueError, test_dtype1) - - def test_dtype2(): - with fluid.program_guard(fluid.Program(), fluid.Program()): - data = fluid.data(name="data", shape=[10], dtype="int64") - paddle.sum(data, dtype="int32") - - self.assertRaises(ValueError, test_dtype2) - - def test_dtype3(): - with fluid.program_guard(fluid.Program(), fluid.Program()): - data = fluid.data(name="data", shape=[10], dtype="float64") - paddle.sum(data, dtype="int32") - - self.assertRaises(ValueError, test_dtype3) - - def test_type(): - with fluid.program_guard(fluid.Program(), fluid.Program()): - data = fluid.data(name="data", shape=[10], dtype="int32") - paddle.sum(data, dtype="bool") - - self.assertRaises(TypeError, test_type) - - class 
API_TestSumOp(unittest.TestCase): def run_static(self, shape, @@ -805,14 +774,26 @@ def test_static(self): shape = [10, 10] axis = 1 + self.run_static(shape, "bool", axis, attr_dtype=None) + self.run_static(shape, "bool", axis, attr_dtype="int32") + self.run_static(shape, "bool", axis, attr_dtype="int64") + self.run_static(shape, "int32", axis, attr_dtype=None) self.run_static(shape, "int32", axis, attr_dtype="int32") self.run_static(shape, "int32", axis, attr_dtype="int64") + self.run_static(shape, "int64", axis, attr_dtype=None) + self.run_static(shape, "int64", axis, attr_dtype="int64") + self.run_static(shape, "int64", axis, attr_dtype="int32") + self.run_static(shape, "float32", axis, attr_dtype=None) self.run_static(shape, "float32", axis, attr_dtype="float32") self.run_static(shape, "float32", axis, attr_dtype="float64") + self.run_static(shape, "float64", axis, attr_dtype=None) + self.run_static(shape, "float64", axis, attr_dtype="float32") + self.run_static(shape, "float64", axis, attr_dtype="float64") + shape = [5, 5, 5] self.run_static(shape, "int32", (0, 1), attr_dtype="int32") self.run_static( diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 64d6910e1f859c..7228c903d6ffa7 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -37,7 +37,7 @@ def elu(x, alpha=1.0, name=None): .. math:: - elu(x) = max(0, x) + min(0, \\alpha * (e^{x}-1)) + elu(x) = max(0, x) + min(0, \alpha * (e^{x}-1)) Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -91,13 +91,13 @@ def gelu(x, approximate=False, name=None): .. math:: - gelu(x) = 0.5 * x * (1 + tanh(\\sqrt{\\frac{2}{\\pi}} * (x + 0.044715x^{3}))) + gelu(x) = 0.5 * x * (1 + tanh(\sqrt{\frac{2}{\pi}} * (x + 0.044715x^{3}))) else .. math:: - gelu(x) = 0.5 * x * (1 + erf(\\frac{x}{\\sqrt{2}})) + gelu(x) = 0.5 * x * (1 + erf(\frac{x}{\sqrt{2}})) Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -144,13 +144,13 @@ def hardshrink(x, threshold=0.5, name=None): .. math:: hardshrink(x)= - \\left\\{ - \\begin{aligned} - &x, & & if \\ x > threshold \\\\ - &x, & & if \\ x < -threshold \\\\ - &0, & & if \\ others - \\end{aligned} - \\right. + \left\{ + \begin{array}{rcl} + x,& &if \ {x > threshold} \\ + x,& &if \ {x < -threshold} \\ + 0,& &if \ {others} & + \end{array} + \right. Args: x (Tensor): The input Tensor with data type float32, float64. @@ -192,11 +192,14 @@ def hardtanh(x, min=-1.0, max=1.0, name=None): .. math:: - hardtanh(x)= \\begin{cases} - max, \\text{if } x > max \\\\ - min, \\text{if } x < min \\\\ - x, \\text{otherwise} - \\end{cases} + hardtanh(x)= + \left\{ + \begin{array}{cll} + max,& & \text{if } x > max \\ + min,& & \text{if } x < min \\ + x,& & \text{otherwise} + \end{array} + \right. Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -246,13 +249,13 @@ def hardsigmoid(x, slope=0.1666667, offset=0.5, name=None): .. math:: hardsigmoid(x)= - \\left\\{ - \\begin{aligned} - &0, & & \\text{if } x \\leq -3 \\\\ - &1, & & \\text{if } x \\geq 3 \\\\ - &slope * x + offset, & & \\text{otherwise} - \\end{aligned} - \\right. + \left\{ + \begin{array}{lcl} + 0, & &\text{if } \ x \leq -3 \\ + 1, & &\text{if } \ x \geq 3 \\ + slope * x + offset, & &\text{otherwise} + \end{array} + \right. Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -302,13 +305,13 @@ def hardswish(x, name=None): .. 
math:: hardswish(x)= - \\left\\{ - \\begin{aligned} - &0, & & \\text{if } x \\leq -3 \\\\ - &x, & & \\text{if } x \\geq 3 \\\\ - &\\frac{x(x+3)}{6}, & & \\text{otherwise} - \\end{aligned} - \\right. + \left\{ + \begin{array}{cll} + 0 &, & \text{if } x \leq -3 \\ + x &, & \text{if } x \geq 3 \\ + \frac{x(x+3)}{6} &, & \text{otherwise} + \end{array} + \right. Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -345,13 +348,13 @@ def leaky_relu(x, negative_slope=0.01, name=None): leaky_relu activation .. math:: - leaky\\_relu(x)= - \\left\\{ - \\begin{aligned} - &x, & & if \\ x >= 0 \\\\ - &negative\_slope * x, & & otherwise \\\\ - \\end{aligned} - \\right. \\\\ + leaky\_relu(x)= + \left\{ + \begin{array}{rcl} + x, & & if \ x >= 0 \\ + negative\_slope * x, & & otherwise \\ + \end{array} + \right. Args: x (Tensor): The input Tensor with data type float32, float64. @@ -513,7 +516,7 @@ def log_sigmoid(x, name=None): .. math:: - log\\_sigmoid(x) = log \\frac{1}{1 + e^{-x}} + log\_sigmoid(x) = log \frac{1}{1 + e^{-x}} Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -554,12 +557,15 @@ def maxout(x, groups, axis=1, name=None): .. math:: - &out_{si+j} = \\max_{k} x_{gsi + sk + j} \\\\ - &g = groups \\\\ - &s = \\frac{input.size}{num\\_channels} \\\\ - &0 \\le i < \\frac{num\\_channels}{groups} \\\\ - &0 \\le j < s \\\\ - &0 \\le k < groups + \begin{array}{l} + &out_{si+j} = \max_{k} x_{gsi + sk + j} \\ + &g = groups \\ + &s = \frac{input.size}{num\_channels} \\ + &0 \le i < \frac{num\_channels}{groups} \\ + &0 \le j < s \\ + &0 \le k < groups + \end{array} + Parameters: x (Tensor): The input is 4-D Tensor with shape [N, C, H, W] or [N, H, W, C], the data type @@ -670,10 +676,12 @@ def selu(x, .. math:: selu(x)= scale * - \\begin{cases} - x, \\text{if } x > 0 \\\\ - alpha * e^{x} - alpha, \\text{if } x <= 0 - \\end{cases} + \left\{ + \begin{array}{lcl} + x,& &\text{if } \ x > 0 \\ + alpha * e^{x} - alpha,& &\text{if } \ x <= 0 + \end{array} + \right. Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -719,9 +727,11 @@ def selu(x, def silu(x, name=None): - """ - silu activation. - .. math: + r""" + silu activation + + .. math:: + silu(x) = \frac{x}{1 + e^{-x}} Parameters: @@ -734,11 +744,12 @@ def silu(x, name=None): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F - - x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) - out = F.silu(x) # [ 0.731059, 1.761594, 2.857722, 3.928055 ] + + import paddle + import paddle.nn.functional as F + + x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) + out = F.silu(x) # [ 0.731059, 1.761594, 2.857722, 3.928055 ] """ if in_dygraph_mode(): @@ -778,7 +789,7 @@ def softmax(x, axis=-1, dtype=None, name=None): .. math:: - softmax[i, j] = \\frac{\\exp(x[i, j])}{\\sum_j(exp(x[i, j])} + softmax[i, j] = \frac{\exp(x[i, j])}{\sum_j(exp(x[i, j])} Example: @@ -923,8 +934,8 @@ def softplus(x, beta=1, threshold=20, name=None): .. math:: - softplus(x) = \\frac{1}{beta} * \\log(1 + e^{beta * x}) \\\\ - \\text{For numerical stability, the implementation reverts to the linear function when: beta * x > threshold.} + softplus(x) = \frac{1}{beta} * \log(1 + e^{beta * x}) \\ + \text{For numerical stability, the implementation reverts to the linear function when: beta * x > threshold.} Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -968,11 +979,14 @@ def softshrink(x, threshold=0.5, name=None): .. 
math:: - softshrink(x)= \\begin{cases} - x - threshold, \\text{if } x > threshold \\\\ - x + threshold, \\text{if } x < -threshold \\\\ - 0, \\text{otherwise} - \\end{cases} + softshrink(x)= + \left\{ + \begin{array}{rcl} + x - threshold,& & \text{if } x > threshold \\ + x + threshold,& & \text{if } x < -threshold \\ + 0,& & \text{otherwise} + \end{array} + \right. Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -1019,7 +1033,7 @@ def softsign(x, name=None): .. math:: - softsign(x) = \\frac{x}{1 + |x|} + softsign(x) = \frac{x}{1 + |x|} Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -1056,7 +1070,7 @@ def swish(x, name=None): .. math:: - swish(x) = \\frac{x}{1 + e^{-x}} + swish(x) = \frac{x}{1 + e^{-x}} Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -1134,10 +1148,14 @@ def thresholded_relu(x, threshold=1.0, name=None): .. math:: - thresholded\\_relu(x) = \\begin{cases} - x, \\text{if } x > threshold \\\\ - 0, \\text{otherwise} - \\end{cases} + thresholded\_relu(x) = + \left\{ + \begin{array}{rl} + x,& \text{if } \ x > threshold \\ + 0,& \text{otherwise} + \end{array} + \right. + Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -1181,10 +1199,10 @@ def log_softmax(x, axis=-1, dtype=None, name=None): .. math:: - \\begin{aligned} - log\\_softmax[i, j] &= log(softmax(x)) \\\\ - &= log(\\frac{\\exp(X[i, j])}{\\sum_j(\\exp(X[i, j])}) - \\end{aligned} + \begin{aligned} + log\_softmax[i, j] &= log(softmax(x)) \\ + &= log(\frac{\exp(X[i, j])}{\sum_j(\exp(X[i, j])}) + \end{aligned} Parameters: x (Tensor): The input Tensor with data type float32, float64. diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index cb7a50ade7ac8f..ef2bfb3b8e0d3a 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -180,18 +180,18 @@ def binary_cross_entropy_with_logits(logit, First this operator calculate loss function as follows: .. math:: - Out = -Labels * \\log(\\sigma(Logit)) - (1 - Labels) * \\log(1 - \\sigma(Logit)) + Out = -Labels * \log(\sigma(Logit)) - (1 - Labels) * \log(1 - \sigma(Logit)) - We know that :math:`\\sigma(Logit) = \\frac{1}{1 + e^{-Logit}}`. By substituting this we get: + We know that :math:`\sigma(Logit) = \frac{1}{1 + e^{-Logit}}`. By substituting this we get: .. math:: - Out = Logit - Logit * Labels + \\log(1 + e^{-Logit}) + Out = Logit - Logit * Labels + \log(1 + e^{-Logit}) For stability and to prevent overflow of :math:`e^{-Logit}` when Logit < 0, we reformulate the loss as follows: .. math:: - Out = \\max(Logit, 0) - Logit * Labels + \\log(1 + e^{-\|Logit\|}) + Out = \max(Logit, 0) - Logit * Labels + \log(1 + e^{-\|Logit\|}) Then, if ``weight`` or ``pos_weight`` is not None, this operator multiply the weight tensor on the loss `Out`. The ``weight`` tensor will attach different @@ -450,17 +450,17 @@ def smooth_l1_loss(input, label, reduction='mean', delta=1.0, name=None): .. math:: - loss(x,y) = \\frac{1}{n}\\sum_{i}z_i + loss(x,y) = \frac{1}{n}\sum_{i}z_i where z_i is given by: .. math:: - \\mathop{z_i} = \\left\\{\\begin{array}{rcl} - 0.5(x_i - y_i)^2 & & {if |x_i - y_i| < delta} \\\\ + \mathop{z_i} = \left\{\begin{array}{rcl} + 0.5(x_i - y_i)^2 & & {if |x_i - y_i| < delta} \\ delta * |x_i - y_i| - 0.5 * delta^2 & & {otherwise} - \\end{array} \\right. + \end{array} \right. Parameters: input (Tensor): Input tensor, the data type is float32 or float64. 
Shape is @@ -631,17 +631,17 @@ def l1_loss(input, label, reduction='mean', name=None): If `reduction` set to ``'none'``, the loss is: .. math:: - Out = \\lvert input - label \\rvert + Out = \lvert input - label \rvert If `reduction` set to ``'mean'``, the loss is: .. math:: - Out = MEAN(\\lvert input - label \\rvert) + Out = MEAN(\lvert input - label \rvert) If `reduction` set to ``'sum'``, the loss is: .. math:: - Out = SUM(\\lvert input - label\\rvert) + Out = SUM(\lvert input - label \rvert) Parameters: @@ -1563,15 +1563,15 @@ def sigmoid_focal_loss(logit, This operator measures focal loss function as follows: .. math:: - Out = -Labels * alpha * {(1 - \\sigma(Logit))}^{gamma}\\log(\\sigma(Logit)) - (1 - Labels) * (1 - alpha) * {\\sigma(Logit)}^{gamma}\\log(1 - \\sigma(Logit)) + Out = -Labels * alpha * {(1 - \sigma(Logit))}^{gamma}\log(\sigma(Logit)) - (1 - Labels) * (1 - alpha) * {\sigma(Logit)}^{gamma}\log(1 - \sigma(Logit)) - We know that :math:`\\sigma(Logit) = \\frac{1}{1 + \\exp(-Logit)}`. + We know that :math:`\sigma(Logit) = \frac{1}{1 + \exp(-Logit)}`. Then, if :attr:`normalizer` is not None, this operator divides the normalizer tensor on the loss `Out`: .. math:: - Out = \\frac{Out}{normalizer} + Out = \frac{Out}{normalizer} Finally, this operator applies reduce operation on the loss. If :attr:`reduction` set to ``'none'``, the operator will return the original loss `Out`. diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 286f8ef167b457..db73e56f879a77 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -34,12 +34,12 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None): .. math:: - y = \\frac{x}{ \\max\\left( \\lvert \\lvert x \\rvert \\rvert_p, epsilon\\right) } + y = \frac{x}{ \max\left( \lvert \lvert x \rvert \rvert_p, epsilon\right) } .. math:: - \\lvert \\lvert x \\rvert \\rvert_p = \\left( \\sum_i {\\lvert x_i \\rvert^p} \\right)^{1/p} + \lvert \lvert x \rvert \rvert_p = \left( \sum_i {\lvert x_i \rvert^p} \right)^{1/p} - where, :math:`\\sum_i{\\lvert x_i \\rvert^p}` is calculated along the ``axis`` dimension. + where, :math:`\sum_i{\lvert x_i \rvert^p}` is calculated along the ``axis`` dimension. Parameters: @@ -432,7 +432,7 @@ def local_response_norm(x, .. math:: - Output(i, x, y) = Input(i, x, y) / \\left(k + \\alpha \\sum\\limits^{\\min(C-1, i + size/2)}_{j = \\max(0, i - size/2)}(Input(j, x, y))^2\\right)^{\\beta} + Output(i, x, y) = Input(i, x, y) / \left(k + \alpha \sum\limits^{\min(C-1, i + size/2)}_{j = \max(0, i - size/2)}(Input(j, x, y))^2\right)^{\beta} In the above equation: diff --git a/python/paddle/nn/initializer/kaiming.py b/python/paddle/nn/initializer/kaiming.py index f0847c85237b25..88a52268776fcb 100644 --- a/python/paddle/nn/initializer/kaiming.py +++ b/python/paddle/nn/initializer/kaiming.py @@ -33,7 +33,7 @@ class KaimingNormal(MSRAInitializer): .. math:: - \sqrt{\\frac{2.0}{fan\_in}} + \sqrt{\frac{2.0}{fan\_in}} Args: fan_in (float32|None): fan_in for Kaiming normal Initializer. If None, it is\ @@ -75,7 +75,7 @@ class KaimingUniform(MSRAInitializer): .. math:: - x = \sqrt{\\frac{6.0}{fan\_in}} + x = \sqrt{\frac{6.0}{fan\_in}} Args: fan_in (float32|None): fan_in for Kaiming uniform Initializer. 
If None, it is\ diff --git a/python/paddle/nn/initializer/xavier.py b/python/paddle/nn/initializer/xavier.py index f2d5593032f64d..aff3a2c15aeec3 100644 --- a/python/paddle/nn/initializer/xavier.py +++ b/python/paddle/nn/initializer/xavier.py @@ -28,7 +28,7 @@ class XavierNormal(XavierInitializer): .. math:: - \sqrt{\\frac{2.0}{fan\_in + fan\_out}} + \sqrt{\frac{2.0}{fan\_in + fan\_out}} Args: @@ -83,7 +83,7 @@ class XavierUniform(XavierInitializer): .. math:: - x = \sqrt{\\frac{6.0}{fan\_in + fan\_out}} + x = \sqrt{\frac{6.0}{fan\_in + fan\_out}} Args: fan_in (float, optional): fan_in for Xavier initialization, it is diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index 695e387bda84f0..abfeff0641a472 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -31,7 +31,7 @@ class ELU(Layer): .. math:: - ELU(x) = max(0, x) + min(0, \\alpha * (e^{x}-1)) + ELU(x) = max(0, x) + min(0, \alpha * (e^{x}-1)) Parameters: alpha (float, optional): The 'alpha' value of the ELU formulation. Default is 1.0. @@ -75,13 +75,13 @@ class GELU(Layer): .. math:: - GELU(x) = 0.5 * x * (1 + tanh(\\sqrt{\\frac{2}{\\pi}} * (x + 0.044715x^{3}))) + GELU(x) = 0.5 * x * (1 + tanh(\sqrt{\frac{2}{\pi}} * (x + 0.044715x^{3}))) else .. math:: - GELU(x) = 0.5 * x * (1 + erf(\\frac{x}{\\sqrt{2}})) + GELU(x) = 0.5 * x * (1 + erf(\frac{x}{\sqrt{2}})) Parameters: approximate (bool, optional): Wether to enable approximation. Default is False. @@ -127,13 +127,13 @@ class Hardshrink(Layer): .. math:: hardshrink(x)= - \\left\\{ - \\begin{aligned} - &x, & & if \\ x > threshold \\\\ - &x, & & if \\ x < -threshold \\\\ - &0, & & if \\ others - \\end{aligned} - \\right. + \left\{ + \begin{array}{rcl} + x, & & if \ x > threshold \\ + x, & & if \ x < -threshold \\ + 0, & & if \ others + \end{array} + \right. Parameters: threshold (float, optional): The value of threshold for hardthrink. Default is 0.5 @@ -179,13 +179,14 @@ class Hardswish(Layer): .. math:: Hardswish(x)= - \\left\\{ - \\begin{aligned} - &0, & & \\text{if } x \\leq -3 \\\\ - &x, & & \\text{if } x \\geq 3 \\\\ - &\\frac{x(x+3)}{6}, & & \\text{otherwise} - \\end{aligned} - \\right. + \left\{ + \begin{array}{cll} + 0 &, & \text{if } x \leq -3 \\ + x &, & \text{if } x \geq 3 \\ + \frac{x(x+3)}{6} &, & \text{otherwise} + \end{array} + \right. + Parameters: name (str, optional): Name for the operation (optional, default is None). @@ -223,7 +224,7 @@ class Tanh(Layer): Tanh Activation. .. math:: - Tanh(x) = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}} + Tanh(x) = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}} Parameters: name (str, optional): Name for the operation (optional, default is None). @@ -265,11 +266,15 @@ class Hardtanh(Layer): .. math:: - Hardtanh(x)= \\begin{cases} - max, \\text{if } x > max \\\\ - min, \\text{if } x < min \\\\ - x, \\text{otherwise} - \\end{cases} + Hardtanh(x)= + \left\{ + \begin{array}{cll} + max,& & \text{if } x > max \\ + min,& & \text{if } x < min \\ + x,& & \text{otherwise} + \end{array} + \right. + Parameters: min (float, optional): The value of min for Hardtanh. Default is -1. @@ -461,10 +466,12 @@ class SELU(Layer): .. math:: SELU(x)= scale * - \\begin{cases} - x, \\text{if } x > 0 \\\\ - alpha * e^{x} - alpha, \\text{if } x <= 0 - \\end{cases} + \left\{ + \begin{array}{lcl} + x,& &\text{if } \ x > 0 \\ + alpha * e^{x} - alpha,& &\text{if } \ x <= 0 + \end{array} + \right. Parameters: scale (float, optional): The value of scale(must be greater than 1.0) for SELU. 
Default is 1.0507009873554804934193349852946 @@ -512,12 +519,13 @@ class LeakyReLU(Layer): .. math:: LeakyReLU(x)= - \\left\\{ - \\begin{aligned} - &x, & & if \\ x >= 0 \\\\ - &negative\_slope * x, & & otherwise \\\\ - \\end{aligned} - \\right. \\\\ + \left\{ + \begin{array}{rcl} + x, & & if \ x >= 0 \\ + negative\_slope * x, & & otherwise \\ + \end{array} + \right. + Parameters: negative_slope (float, optional): Slope of the activation function at @@ -604,13 +612,14 @@ class Hardsigmoid(Layer): .. math:: Hardsigmoid(x)= - \\left\\{ - \\begin{aligned} - &0, & & \\text{if } x \\leq -3 \\\\ - &1, & & \\text{if } x \\geq 3 \\\\ - &x/6 + 1/2, & & \\text{otherwise} - \\end{aligned} - \\right. + \left\{ + \begin{array}{rcl} + 0, & & \text{if } \ x \leq -3 \\ + 1, & & \text{if } \ x \geq 3 \\ + x/6 + 1/2, & & \text{otherwise} + \end{array} + \right. + Parameters: name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -650,8 +659,8 @@ class Softplus(Layer): .. math:: - Softplus(x) = \\frac{1}{beta} * \\log(1 + e^{beta * x}) \\\\ - \\text{For numerical stability, the implementation reverts to the linear function when: beta * x > threshold.} + Softplus(x) = \frac{1}{beta} * \log(1 + e^{beta * x}) \\ + \text{For numerical stability, the implementation reverts to the linear function when: beta * x > threshold.} Parameters: beta (float, optional): The value of beta for Softplus. Default is 1 @@ -695,11 +704,15 @@ class Softshrink(Layer): .. math:: - Softshrink(x)= \\begin{cases} - x - threshold, \\text{if } x > threshold \\\\ - x + threshold, \\text{if } x < -threshold \\\\ - 0, \\text{otherwise} - \\end{cases} + Softshrink(x)= + \left\{ + \begin{array}{rcl} + x - threshold,& & \text{if } x > threshold \\ + x + threshold,& & \text{if } x < -threshold \\ + 0,& & \text{otherwise} + \end{array} + \right. + Parameters: threshold (float, optional): The value of threshold(must be no less than zero) for softplus. Default is 0.5 @@ -740,7 +753,7 @@ class Softsign(Layer): .. math:: - Softsign(x) = \\frac{x}{1 + |x|} + Softsign(x) = \frac{x}{1 + |x|} Parameters: name (str, optional): Name for the operation (optional, default is None). @@ -779,7 +792,7 @@ class Swish(Layer): .. math:: - Swish(x) = \\frac{x}{1 + e^{-x}} + Swish(x) = \frac{x}{1 + e^{-x}} Parameters: name (str, optional): Name for the operation (optional, default is None). @@ -857,10 +870,14 @@ class ThresholdedReLU(Layer): .. math:: - ThresholdedReLU(x) = \\begin{cases} - x, \\text{if } x > threshold \\\\ - 0, \\text{otherwise} - \\end{cases} + ThresholdedReLU(x) = + \left\{ + \begin{array}{rl} + x,& \text{if } \ x > threshold \\ + 0,& \text{otherwise} + \end{array} + \right. + Parameters: threshold (float, optional): The value of threshold for ThresholdedReLU. Default is 1.0 @@ -939,7 +956,7 @@ class LogSigmoid(Layer): .. math:: - LogSigmoid(x) = log \\frac{1}{1 + e^{-x}} + LogSigmoid(x) = log \frac{1}{1 + e^{-x}} Parameters: x (Tensor): The input Tensor with data type float32, or float64. @@ -1001,7 +1018,7 @@ class Softmax(Layer): .. math:: - Softmax[i, j] = \\frac{\\exp(x[i, j])}{\\sum_j(exp(x[i, j])} + Softmax[i, j] = \frac{\exp(x[i, j])}{\sum_j(exp(x[i, j])} Example: @@ -1105,10 +1122,10 @@ class LogSoftmax(Layer): .. 
math:: - \\begin{aligned} - Out[i, j] &= log(softmax(x)) \\\\ - &= log(\\frac{\\exp(X[i, j])}{\\sum_j(\\exp(X[i, j])}) - \\end{aligned} + \begin{array} {rcl} + Out[i, j] &= &log(softmax(x)) \\ + &= &log(\frac{\exp(X[i, j])}{\sum_j(\exp(X[i, j])}) + \end{array} Parameters: axis (int, optional): The axis along which to perform log_softmax @@ -1167,12 +1184,14 @@ class Maxout(Layer): .. math:: - &out_{si+j} = \max_{k} x_{gsi + sk + j} \\\\ - &g = groups \\\\ - &s = \\frac{input.size}{num\\_channels} \\\\ - &0 \\le i < \\frac{num\\_channels}{groups} \\\\ - &0 \\le j < s \\\\ - &0 \\le k < groups + \begin{array}{l} + &out_{si+j} = \max_{k} x_{gsi + sk + j} \\ + &g = groups \\ + &s = \frac{input.size}{num\_channels} \\ + &0 \le i < \frac{num\_channels}{groups} \\ + &0 \le j < s \\ + &0 \le k < groups + \end{array} Parameters: groups (int, optional): The groups number of maxout. `groups` specifies the diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index 31b552bed162c2..3ac0d675fb72c6 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -40,18 +40,18 @@ class BCEWithLogitsLoss(Layer): First this operator calculate loss function as follows: .. math:: - Out = -Labels * \\log(\\sigma(Logit)) - (1 - Labels) * \\log(1 - \\sigma(Logit)) + Out = -Labels * \log(\sigma(Logit)) - (1 - Labels) * \log(1 - \sigma(Logit)) - We know that :math:`\\sigma(Logit) = \\frac{1}{1 + \\e^{-Logit}}`. By substituting this we get: + We know that :math:`\sigma(Logit) = \frac{1}{1 + e^{-Logit}}`. By substituting this we get: .. math:: - Out = Logit - Logit * Labels + \\log(1 + \\e^{-Logit}) + Out = Logit - Logit * Labels + \log(1 + e^{-Logit}) - For stability and to prevent overflow of :math:`\\e^{-Logit}` when Logit < 0, + For stability and to prevent overflow of :math:`e^{-Logit}` when Logit < 0, we reformulate the loss as follows: .. math:: - Out = \\max(Logit, 0) - Logit * Labels + \\log(1 + \\e^{-\|Logit\|}) + Out = \max(Logit, 0) - Logit * Labels + \log(1 + e^{-\|Logit\|}) Then, if ``weight`` or ``pos_weight`` is not None, this operator multiply the weight tensor on the loss `Out`. The ``weight`` tensor will attach different @@ -779,8 +779,6 @@ def forward(self, input, label): class NLLLoss(Layer): r""" - :alias_main: paddle.nn.NLLLoss - :alias: paddle.nn.NLLLoss,paddle.nn.layer.NLLLoss,paddle.nn.layer.loss.NLLLoss This class accepts input and target label and returns negative log likelihood cross error. It is useful to train a classification problem with C classes. @@ -800,20 +798,25 @@ class NLLLoss(Layer): The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as: .. math:: - \ell(x, y) = L = \{l_1,\dots,l_N\}^\\top, \quad + + \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad l_n = - w_{y_n} x_{n,y_n}, \quad - w_{c} = \\text{weight}[c] \cdot \mathbb{1}\{c \\not= \\text{ignore\\_index}\}, + w_{c} = \text{weight}[c] \cdot \mathbb{1}\{c \not= \text{ignore\_index}\}, where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'`` (default ``'mean'``), then .. math:: - \ell(x, y) = \\begin{cases} - \\sum_{n=1}^N \\frac{1}{\\sum_{n=1}^N w_{y_n}} l_n, & - \\text{if reduction} = \\text{'mean';}\\\\ - \\sum_{n=1}^N l_n, & - \\text{if reduction} = \\text{'sum'.} - \\end{cases} + + \ell(x, y) = + \left\{ + \begin{array}{lcl} + \sum_{n=1}^N \frac{1}{\sum_{n=1}^N w_{y_n}} l_n, & + \text{if reduction} = \text{'mean';}\\ + \sum_{n=1}^N l_n, & + \text{if reduction} = \text{'sum'.} + \end{array} + \right. 
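For example, when every class weight equals one and no index is ignored, the ``'mean'`` reduction above is just the average of the per-sample losses:

.. math::

    \ell(x, y) = \frac{1}{N}\sum_{n=1}^{N} - x_{n,y_n}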
Parameters: weight (Tensor, optional): Weight tensor, a manual rescaling weight given @@ -1136,16 +1139,16 @@ class SmoothL1Loss(Layer): .. math:: - loss(x,y) = \\frac{1}{n}\\sum_{i}z_i + loss(x,y) = \frac{1}{n}\sum_{i}z_i where z_i is given by: .. math:: - \\mathop{z_i} = \\left\\{\\begin{array}{rcl} - 0.5(x_i - y_i)^2 & & {if |x_i - y_i| < delta} \\\\ + \mathop{z_i} = \left\{\begin{array}{rcl} + 0.5(x_i - y_i)^2 & & {if |x_i - y_i| < delta} \\ delta * |x_i - y_i| - 0.5 * delta^2 & & {otherwise} - \\end{array} \\right. + \end{array} \right. Parameters: reduction (str, optional): Indicate how to average the loss by batch_size, diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 9abbc494258948..41599809810ee7 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -115,13 +115,13 @@ class InstanceNorm1D(_InstanceNormBase): .. math:: - \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\ - \\ mean\ of\ one\ feature\ map\ in\ mini-batch \\\\ - \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\ - \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\ - \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ - \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ - y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + \mu_{\beta} &\gets \frac{1}{HW} \sum_{i=1}^{HW} x_i \qquad &//\ + \ mean\ of\ one\ feature\ map\ in\ mini-batch \\ + \sigma_{\beta}^{2} &\gets \frac{1}{HW} \sum_{i=1}^{HW}(x_i - \ + \mu_{\beta})^2 \qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\ + \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\ + \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ + y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift Note: `H` means height of feature map, `W` means width of feature map. @@ -187,13 +187,13 @@ class InstanceNorm2D(_InstanceNormBase): .. math:: - \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\ - \\ mean\ of\ one\ feature\ map\ in\ mini-batch \\\\ - \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\ - \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\ - \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ - \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ - y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + \mu_{\beta} &\gets \frac{1}{HW} \sum_{i=1}^{HW} x_i \qquad &//\ + \ mean\ of\ one\ feature\ map\ in\ mini-batch \\ + \sigma_{\beta}^{2} &\gets \frac{1}{HW} \sum_{i=1}^{HW}(x_i - \ + \mu_{\beta})^2 \qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\ + \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\ + \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ + y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift Note: `H` means height of feature map, `W` means width of feature map. @@ -257,13 +257,13 @@ class InstanceNorm3D(_InstanceNormBase): .. 
math:: - \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\ - \\ mean\ of\ one\ feature\ map\ in\ mini-batch \\\\ - \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\ - \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\ - \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ - \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ - y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + \mu_{\beta} &\gets \frac{1}{HW} \sum_{i=1}^{HW} x_i \qquad &//\ + \ mean\ of\ one\ feature\ map\ in\ mini-batch \\ + \sigma_{\beta}^{2} &\gets \frac{1}{HW} \sum_{i=1}^{HW}(x_i - \ + \mu_{\beta})^2 \qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\ + \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\ + \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ + y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift Note: `H` means height of feature map, `W` means width of feature map. @@ -450,15 +450,15 @@ class LayerNorm(Layer): .. math:: - \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} x_i + \mu & = \frac{1}{H}\sum_{i=1}^{H} x_i - \\sigma & = \\sqrt{\\frac{1}{H}\sum_{i=1}^{H}{(x_i - \\mu)^2} + \\epsilon} + \sigma & = \sqrt{\frac{1}{H}\sum_{i=1}^{H}{(x_i - \mu)^2} + \epsilon} - y & = f(\\frac{g}{\\sigma}(x - \\mu) + b) + y & = f(\frac{g}{\sigma}(x - \mu) + b) - :math:`x`: the vector representation of the summed inputs to the neurons in that layer. - :math:`H`: the number of hidden units in a layers - - :math:`\\epsilon`: the small value added to the variance to prevent division by zero. + - :math:`\epsilon`: the small value added to the variance to prevent division by zero. - :math:`g`: the trainable scale parameter. - :math:`b`: the trainable bias parameter. @@ -666,37 +666,36 @@ class BatchNorm1D(_BatchNormBase): r""" Applies Batch Normalization over a 2D or 3D input (a mini-batch of 1D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . - When use_global_stats = False, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch. + When use_global_stats = False, the :math:`\mu_{\beta}` + and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch. Calculated as follows: .. math:: - \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ - \ mini-batch\ mean \\\\ - \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ - \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &//\ + \ mini-batch\ mean \\ + \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \ + \mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\ - When use_global_stats = True, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. + When use_global_stats = True, the :math:`\mu_{\beta}` + and :math:`\sigma_{\beta}^{2}` are not the statistics of one mini-batch. They are global or running statistics (moving_mean and moving_variance). It usually got from the pre-trained model. Calculated as follows: .. math:: - moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ - moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. 
- momentum) \quad &// global \ mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global \ variance \\ The normalization function formula is as follows: .. math:: - \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ - \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ - y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ + y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift - - :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero - - :math:`\\gamma` : trainable proportional parameter - - :math:`\\beta` : trainable deviation parameter + - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero + - :math:`\gamma` : trainable proportional parameter + - :math:`\beta` : trainable deviation parameter Parameters: num_features(int): Indicate the number of channels of the input ``Tensor``. @@ -770,37 +769,36 @@ class BatchNorm2D(_BatchNormBase): r""" Applies Batch Normalization over a 4D input (a mini-batch of 2D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . - When use_global_stats = False, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch. + When use_global_stats = False, the :math:`\mu_{\beta}` + and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch. Calculated as follows: .. math:: - \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ - \ mini-batch\ mean \\\\ - \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ - \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &// + \ mini-batch\ mean \\ + \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - + \mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\ - When use_global_stats = True, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. + When use_global_stats = True, the :math:`\mu_{\beta}` + and :math:`\sigma_{\beta}^{2}` are not the statistics of one mini-batch. They are global or running statistics (moving_mean and moving_variance). It usually got from the pre-trained model. Calculated as follows: .. math:: - moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ - moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global \ mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global \ variance \\ The normalization function formula is as follows: .. 
math:: - \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ - \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ - y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ + y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift - - :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero - - :math:`\\gamma` : trainable proportional parameter - - :math:`\\beta` : trainable deviation parameter + - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero + - :math:`\gamma` : trainable proportional parameter + - :math:`\beta` : trainable deviation parameter Parameters: num_features(int): Indicate the number of channels of the input ``Tensor``. @@ -859,16 +857,16 @@ class BatchNorm3D(_BatchNormBase): r""" Applies Batch Normalization over a 5D input (a mini-batch of 3D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . - When use_global_stats = False, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch. + When use_global_stats = False, the :math:`\mu_{\beta}` + and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch. Calculated as follows: .. math:: - \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ - \ mini-batch\ mean \\\\ - \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ - \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &//\ + \ mini-batch\ mean \\ + \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \ + \mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\ When use_global_stats = True, the :math:`\\mu_{\\beta}` and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. @@ -876,20 +874,19 @@ class BatchNorm3D(_BatchNormBase): pre-trained model. Calculated as follows: .. math:: - moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ - moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global \ mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global \ variance \\ The normalization function formula is as follows: .. math:: - \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ - \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ - y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ + y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift - - :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero - - :math:`\\gamma` : trainable proportional parameter - - :math:`\\beta` : trainable deviation parameter + - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero + - :math:`\gamma` : trainable proportional parameter + - :math:`\beta` : trainable deviation parameter Parameters: num_features(int): Indicate the number of channels of the input ``Tensor``. @@ -976,33 +973,33 @@ class SyncBatchNorm(_BatchNormBase): .. 
math:: - \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ - \ mini-batch\ mean \\\\ - \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ - \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &//\ + \ mini-batch\ mean \\ + \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \ + \mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\ - :math:`x` : whole mini-batch data in all gpus - :math:`m` : the size of the whole mini-batch data When model in evaluation mode, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are global statistics (moving_mean and moving_variance, + and :math:`\sigma_{\beta}^{2}` are global statistics (moving_mean and moving_variance, which usually got from the pre-trained model). Global statistics calculated as follows: .. math:: - moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ - moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global \ mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global \ variance \\ The formula of normalization is as follows: .. math:: - \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ - \\sigma_{\\beta}^{2} + \\eps}} \\qquad &//\ normalize \\\\ - y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\ + \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ + y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift - - :math:`\\eps` : add a smaller value to the variance to prevent division by zero - - :math:`\\gamma` : trainable scale parameter vector - - :math:`\\beta` : trainable shift parameter vector + - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero + - :math:`\gamma` : trainable scale parameter vector + - :math:`\beta` : trainable shift parameter vector Note: If you want to use container to pack your model and has ``SyncBatchNorm`` in the diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index 77168566d88c60..fbb648af42a337 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -332,6 +332,10 @@ def __init__(self, bias_hh_attr=None, name=None): super(SimpleRNNCell, self).__init__() + if hidden_size <= 0: + raise ValueError( + "hidden_size of {} must be greater than 0, but now equals to {}". + format(self.__class__.__name__, hidden_size)) std = 1.0 / math.sqrt(hidden_size) self.weight_ih = self.create_parameter( (hidden_size, input_size), @@ -480,6 +484,10 @@ def __init__(self, bias_hh_attr=None, name=None): super(LSTMCell, self).__init__() + if hidden_size <= 0: + raise ValueError( + "hidden_size of {} must be greater than 0, but now equals to {}". + format(self.__class__.__name__, hidden_size)) std = 1.0 / math.sqrt(hidden_size) self.weight_ih = self.create_parameter( (4 * hidden_size, input_size), @@ -627,6 +635,10 @@ def __init__(self, bias_hh_attr=None, name=None): super(GRUCell, self).__init__() + if hidden_size <= 0: + raise ValueError( + "hidden_size of {} must be greater than 0, but now equals to {}". 
+ format(self.__class__.__name__, hidden_size)) std = 1.0 / math.sqrt(hidden_size) self.weight_ih = self.create_parameter( (3 * hidden_size, input_size), diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 01be63c5dfed48..394d46b9161903 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -716,13 +716,15 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None): else: reduce_all_flag = False - dtype_flag = False - if dtype is not None: - if dtype in ['float64', 'int64']: - if (convert_dtype(x.dtype) == "float32" and dtype == "float64") or \ - (convert_dtype(x.dtype) == "int32" and dtype == "int64"): - dtype_flag = True - + def get_dtype(x, dtype): + if dtype is not None: + return (True, dtype) + src_type = convert_dtype(x.dtype) + if src_type in ['bool','int32', 'int64']: + return (True, 'int64') + return (False, src_type) + + dtype_flag, dtype = get_dtype(x, dtype) if in_dygraph_mode(): axis = axis if axis != None and axis != [] else [0] if dtype_flag: @@ -740,27 +742,17 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None): 'reduce_all': reduce_all_flag } - if dtype is not None: - if dtype in ['float64', 'int64']: - if (convert_dtype(x.dtype) == "float32" and dtype == "float64") or \ - (convert_dtype(x.dtype) == "int32" and dtype == "int64"): - attrs.update({ - 'in_dtype': x.dtype, - 'out_dtype': convert_np_dtype_to_dtype_(dtype) - }) + if dtype_flag: + attrs.update({ + 'in_dtype': x.dtype, + 'out_dtype': convert_np_dtype_to_dtype_(dtype) + }) check_variable_and_dtype( - x, 'x', ['float32', 'float64', 'int32', 'int64'], 'sum') - - if dtype is not None: - check_dtype(dtype, 'dtype', ['float32', 'float64', 'int32', 'int64'], 'sum') - x_dtype = convert_dtype(x.dtype) - - if (x_dtype == "float64" and dtype in ["float32", "int32"]) or \ - (x_dtype == "int64" and dtype == "int32"): - raise ValueError("The input(x)'s dtype is {} but the attr(dtype) of sum is {}, " - "which may cause data type overflows. Please reset attr(dtype) of sum." - .format(x_dtype, dtype)) + x, 'x', ['bool', 'float16', 'float32', 'float64', + 'int32', 'int64', 'complex64', 'complex128', + u'bool', u'float16', u'float32', u'float64', + u'int32', u'int64', u'complex64', u'complex128'], 'sum') check_type(axis, 'axis', (int, list, tuple, type(None)), 'sum') diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py index abeb83391751be..f90ff0c99af959 100644 --- a/python/paddle/tests/test_model.py +++ b/python/paddle/tests/test_model.py @@ -169,7 +169,7 @@ class TestModel(unittest.TestCase): @classmethod def setUpClass(cls): if not fluid.is_compiled_with_cuda(): - cls.skipTest('module not tested when ONLY_CPU compling') + cls().skipTest('module not tested when ONLY_CPU compling') cls.device = paddle.set_device('gpu') fluid.enable_dygraph(cls.device) diff --git a/python/paddle/utils/__init__.py b/python/paddle/utils/__init__.py index c23841ea8b802b..2c7bca71698d44 100644 --- a/python/paddle/utils/__init__.py +++ b/python/paddle/utils/__init__.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from . 
import gast from .profiler import ProfilerOptions # noqa: F401 from .profiler import Profiler # noqa: F401 from .profiler import get_profiler # noqa: F401 diff --git a/python/paddle/utils/gast/__init__.py b/python/paddle/utils/gast/__init__.py new file mode 100644 index 00000000000000..0bcbf5abb81b26 --- /dev/null +++ b/python/paddle/utils/gast/__init__.py @@ -0,0 +1,33 @@ +# Copyright (c) 2016, Serge Guelton +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# Neither the name of HPCProject, Serge Guelton nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# NOTE(paddle-dev): We introduce third-party library Gast as unified AST +# representation. See https://github.com/serge-sans-paille/gast for details. + +from .gast import * +from ast import NodeVisitor, NodeTransformer, iter_fields, dump diff --git a/python/paddle/utils/gast/ast3.py b/python/paddle/utils/gast/ast3.py new file mode 100644 index 00000000000000..58840d5c29074c --- /dev/null +++ b/python/paddle/utils/gast/ast3.py @@ -0,0 +1,449 @@ +# Copyright (c) 2016, Serge Guelton +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# Neither the name of HPCProject, Serge Guelton nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# NOTE(paddle-dev): We introduce third-party library Gast as unified AST +# representation. See https://github.com/serge-sans-paille/gast for details. + +from .astn import AstToGAst, GAstToAst +from . import gast +import ast +import sys + + +class Ast3ToGAst(AstToGAst): + if sys.version_info.minor < 9: + + def visit_ExtSlice(self, node): + new_node = gast.Tuple(self._visit(node.dims), gast.Load()) + gast.copy_location(new_node, node) + return new_node + + def visit_Index(self, node): + return self._visit(node.value) + + if sys.version_info.minor < 8: + + def visit_Module(self, node): + new_node = gast.Module( + self._visit(node.body), + [] # type_ignores + ) + return new_node + + def visit_Num(self, node): + new_node = gast.Constant( + node.n, + None, ) + gast.copy_location(new_node, node) + return new_node + + def visit_Ellipsis(self, node): + new_node = gast.Constant( + Ellipsis, + None, ) + gast.copy_location(new_node, node) + new_node.end_lineno = new_node.end_col_offset = None + return new_node + + def visit_Str(self, node): + new_node = gast.Constant( + node.s, + None, ) + gast.copy_location(new_node, node) + return new_node + + def visit_Bytes(self, node): + new_node = gast.Constant( + node.s, + None, ) + gast.copy_location(new_node, node) + return new_node + + def visit_FunctionDef(self, node): + new_node = gast.FunctionDef( + self._visit(node.name), + self._visit(node.args), + self._visit(node.body), + self._visit(node.decorator_list), + self._visit(node.returns), + None, # type_comment + ) + gast.copy_location(new_node, node) + return new_node + + def visit_AsyncFunctionDef(self, node): + new_node = gast.AsyncFunctionDef( + self._visit(node.name), + self._visit(node.args), + self._visit(node.body), + self._visit(node.decorator_list), + self._visit(node.returns), + None, # type_comment + ) + gast.copy_location(new_node, node) + return new_node + + def visit_For(self, node): + new_node = gast.For( + self._visit(node.target), + self._visit(node.iter), + self._visit(node.body), + self._visit(node.orelse), + None, # type_comment + ) + gast.copy_location(new_node, node) + return new_node + + def visit_AsyncFor(self, node): + new_node = gast.AsyncFor( + self._visit(node.target), + self._visit(node.iter), + self._visit(node.body), + self._visit(node.orelse), + None, # type_comment + ) + gast.copy_location(new_node, node) + return new_node + + def visit_With(self, node): + new_node = gast.With( + self._visit(node.items), + self._visit(node.body), + None, # type_comment + ) + gast.copy_location(new_node, node) + return new_node + + def visit_AsyncWith(self, node): + new_node = gast.AsyncWith( + self._visit(node.items), + self._visit(node.body), + None, # type_comment + ) + gast.copy_location(new_node, node) + return new_node + + def visit_Call(self, node): + if sys.version_info.minor < 5: + if node.starargs: + star = gast.Starred(self._visit(node.starargs), gast.Load()) + gast.copy_location(star, node) + starred = [star] + else: + starred = [] + + 
if node.kwargs: + kw = gast.keyword(None, self._visit(node.kwargs)) + gast.copy_location(kw, node.kwargs) + kwargs = [kw] + else: + kwargs = [] + else: + starred = kwargs = [] + + new_node = gast.Call( + self._visit(node.func), + self._visit(node.args) + starred, + self._visit(node.keywords) + kwargs, ) + gast.copy_location(new_node, node) + return new_node + + def visit_NameConstant(self, node): + if node.value is None: + new_node = gast.Constant(None, None) + elif node.value is True: + new_node = gast.Constant(True, None) + elif node.value is False: + new_node = gast.Constant(False, None) + gast.copy_location(new_node, node) + return new_node + + def visit_arguments(self, node): + new_node = gast.arguments( + self._visit(node.args), + [], # posonlyargs + self._visit(node.vararg), + self._visit(node.kwonlyargs), + self._visit(node.kw_defaults), + self._visit(node.kwarg), + self._visit(node.defaults), ) + gast.copy_location(new_node, node) + return new_node + + def visit_Name(self, node): + new_node = gast.Name( + self._visit(node.id), + self._visit(node.ctx), + None, + None, ) + ast.copy_location(new_node, node) + return new_node + + def visit_arg(self, node): + if sys.version_info.minor < 8: + extra_args = [None] + else: + extra_args = [self._visit(node.type_comment)] + + new_node = gast.Name( + self._visit(node.arg), + gast.Param(), + self._visit(node.annotation), + *extra_args # type_comment + ) + ast.copy_location(new_node, node) + return new_node + + def visit_ExceptHandler(self, node): + if node.name: + new_node = gast.ExceptHandler( + self._visit(node.type), + gast.Name(node.name, gast.Store(), None, None), + self._visit(node.body)) + ast.copy_location(new_node, node) + return new_node + else: + return self.generic_visit(node) + + if sys.version_info.minor < 6: + + def visit_comprehension(self, node): + new_node = gast.comprehension( + target=self._visit(node.target), + iter=self._visit(node.iter), + ifs=self._visit(node.ifs), + is_async=0, ) + return ast.copy_location(new_node, node) + + +class GAstToAst3(GAstToAst): + if sys.version_info.minor < 9: + + def visit_Subscript(self, node): + def adjust_slice(s): + if isinstance(s, ast.Slice): + return s + else: + return ast.Index(s) + + if isinstance(node.slice, gast.Tuple): + if any(isinstance(elt, gast.slice) for elt in node.slice.elts): + new_slice = ast.ExtSlice([ + adjust_slice(x) for x in self._visit(node.slice.elts) + ]) + else: + value = ast.Tuple(self._visit(node.slice.elts), ast.Load()) + ast.copy_location(value, node.slice) + new_slice = ast.Index(value) + else: + new_slice = adjust_slice(self._visit(node.slice)) + ast.copy_location(new_slice, node.slice) + + new_node = ast.Subscript( + self._visit(node.value), + new_slice, + self._visit(node.ctx), ) + ast.copy_location(new_node, node) + return new_node + + if sys.version_info.minor < 8: + + def visit_Module(self, node): + new_node = ast.Module(self._visit(node.body)) + return new_node + + def visit_Constant(self, node): + if node.value is None: + new_node = ast.NameConstant(node.value) + elif node.value is Ellipsis: + new_node = ast.Ellipsis() + elif isinstance(node.value, bool): + new_node = ast.NameConstant(node.value) + elif isinstance(node.value, (int, float, complex)): + new_node = ast.Num(node.value) + elif isinstance(node.value, str): + new_node = ast.Str(node.value) + else: + new_node = ast.Bytes(node.value) + ast.copy_location(new_node, node) + return new_node + + def _make_arg(self, node): + if node is None: + return None + + if sys.version_info.minor < 8: + 
extra_args = tuple() + else: + extra_args = self._visit(node.type_comment), + + new_node = ast.arg( + self._visit(node.id), self._visit(node.annotation), *extra_args) + return ast.copy_location(new_node, node) + + def visit_Name(self, node): + new_node = ast.Name( + self._visit(node.id), + self._visit(node.ctx), ) + ast.copy_location(new_node, node) + return new_node + + def visit_ExceptHandler(self, node): + if node.name: + new_node = ast.ExceptHandler( + self._visit(node.type), node.name.id, self._visit(node.body)) + return ast.copy_location(new_node, node) + else: + return self.generic_visit(node) + + if sys.version_info.minor < 5: + + def visit_Call(self, node): + if node.args and isinstance(node.args[-1], gast.Starred): + args = node.args[:-1] + starargs = node.args[-1].value + else: + args = node.args + starargs = None + + if node.keywords and node.keywords[-1].arg is None: + keywords = node.keywords[:-1] + kwargs = node.keywords[-1].value + else: + keywords = node.keywords + kwargs = None + + new_node = ast.Call( + self._visit(node.func), + self._visit(args), + self._visit(keywords), + self._visit(starargs), + self._visit(kwargs), ) + ast.copy_location(new_node, node) + return new_node + + def visit_ClassDef(self, node): + self.generic_visit(node) + new_node = ast.ClassDef( + name=self._visit(node.name), + bases=self._visit(node.bases), + keywords=self._visit(node.keywords), + body=self._visit(node.body), + decorator_list=self._visit(node.decorator_list), + starargs=None, + kwargs=None, ) + return ast.copy_location(new_node, node) + + elif sys.version_info.minor < 8: + + def visit_FunctionDef(self, node): + new_node = ast.FunctionDef( + self._visit(node.name), + self._visit(node.args), + self._visit(node.body), + self._visit(node.decorator_list), + self._visit(node.returns), ) + ast.copy_location(new_node, node) + return new_node + + def visit_AsyncFunctionDef(self, node): + new_node = ast.AsyncFunctionDef( + self._visit(node.name), + self._visit(node.args), + self._visit(node.body), + self._visit(node.decorator_list), + self._visit(node.returns), ) + ast.copy_location(new_node, node) + return new_node + + def visit_For(self, node): + new_node = ast.For( + self._visit(node.target), + self._visit(node.iter), + self._visit(node.body), + self._visit(node.orelse), ) + ast.copy_location(new_node, node) + return new_node + + def visit_AsyncFor(self, node): + new_node = ast.AsyncFor( + self._visit(node.target), + self._visit(node.iter), + self._visit(node.body), + self._visit(node.orelse), + None, # type_comment + ) + ast.copy_location(new_node, node) + return new_node + + def visit_With(self, node): + new_node = ast.With( + self._visit(node.items), + self._visit(node.body), ) + ast.copy_location(new_node, node) + return new_node + + def visit_AsyncWith(self, node): + new_node = ast.AsyncWith( + self._visit(node.items), + self._visit(node.body), ) + ast.copy_location(new_node, node) + return new_node + + def visit_Call(self, node): + new_node = ast.Call( + self._visit(node.func), + self._visit(node.args), + self._visit(node.keywords), ) + ast.copy_location(new_node, node) + return new_node + + def visit_arguments(self, node): + extra_args = [ + self._make_arg(node.vararg), + [self._make_arg(n) for n in node.kwonlyargs], + self._visit(node.kw_defaults), + self._make_arg(node.kwarg), + self._visit(node.defaults), + ] + if sys.version_info.minor >= 8: + new_node = ast.arguments( + [self._make_arg(arg) for arg in node.posonlyargs], + [self._make_arg(n) for n in node.args], *extra_args) + else: 
+ new_node = ast.arguments([self._make_arg(n) for n in node.args], + *extra_args) + return new_node + + +def ast_to_gast(node): + return Ast3ToGAst().visit(node) + + +def gast_to_ast(node): + return GAstToAst3().visit(node) diff --git a/python/paddle/utils/gast/astn.py b/python/paddle/utils/gast/astn.py new file mode 100644 index 00000000000000..bd88ba5efc512a --- /dev/null +++ b/python/paddle/utils/gast/astn.py @@ -0,0 +1,64 @@ +# Copyright (c) 2016, Serge Guelton +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# Neither the name of HPCProject, Serge Guelton nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# NOTE(paddle-dev): We introduce third-party library Gast as unified AST +# representation. See https://github.com/serge-sans-paille/gast for details. + +import ast +from . import gast + + +def _generate_translators(to): + class Translator(ast.NodeTransformer): + def _visit(self, node): + if isinstance(node, list): + return [self._visit(n) for n in node] + elif isinstance(node, ast.AST): + return self.visit(node) + else: + return node + + def generic_visit(self, node): + cls = type(node).__name__ + # handle nodes that are not part of the AST + if not hasattr(to, cls): + return + new_node = getattr(to, cls)() + for field in node._fields: + setattr(new_node, field, self._visit(getattr(node, field))) + for attr in getattr(node, '_attributes'): + if hasattr(node, attr): + setattr(new_node, attr, getattr(node, attr)) + return new_node + + return Translator + + +AstToGAst = _generate_translators(gast) + +GAstToAst = _generate_translators(ast) diff --git a/python/paddle/utils/gast/gast.py b/python/paddle/utils/gast/gast.py new file mode 100644 index 00000000000000..f561c83995ac1d --- /dev/null +++ b/python/paddle/utils/gast/gast.py @@ -0,0 +1,609 @@ +# Copyright (c) 2016, Serge Guelton +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. 
+ +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# Neither the name of HPCProject, Serge Guelton nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# NOTE(paddle-dev): We introduce third-party library Gast as unified AST +# representation. See https://github.com/serge-sans-paille/gast for details. + +import sys as _sys +import ast as _ast +from ast import boolop, cmpop, excepthandler, expr, expr_context, operator +from ast import slice, stmt, unaryop, mod, AST +from ast import iter_child_nodes, walk + +try: + from ast import TypeIgnore +except ImportError: + + class TypeIgnore(AST): + pass + + +def _make_node(Name, Fields, Attributes, Bases): + def create_node(self, *args, **kwargs): + nbparam = len(args) + len(kwargs) + assert nbparam in (0, len(Fields)), \ + "Bad argument number for {}: {}, expecting {}".\ + format(Name, nbparam, len(Fields)) + self._fields = Fields + self._attributes = Attributes + for argname, argval in zip(self._fields, args): + setattr(self, argname, argval) + for argname, argval in kwargs.items(): + assert argname in Fields, \ + "Invalid Keyword argument for {}: {}".format(Name, argname) + setattr(self, argname, argval) + + setattr(_sys.modules[__name__], Name, + type(Name, Bases, {'__init__': create_node})) + + +_nodes = ( + # mod + ('Module', (('body', 'type_ignores'), (), (mod, ))), + ('Interactive', (('body', ), (), (mod, ))), + ('Expression', (('body', ), (), (mod, ))), + ('FunctionType', (('argtypes', 'returns'), (), (mod, ))), + ('Suite', (('body', ), (), (mod, ))), + + # stmt + ('FunctionDef', (('name', 'args', 'body', 'decorator_list', 'returns', + 'type_comment'), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('AsyncFunctionDef', (('name', 'args', 'body', 'decorator_list', 'returns', + 'type_comment'), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('ClassDef', (( + 'name', + 'bases', + 'keywords', + 'body', + 'decorator_list', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('Return', (('value', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('Delete', (('targets', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('Assign', (( + 'targets', + 'value', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('AugAssign', (( + 'target', + 'op', + 'value', ), ( + 'lineno', + 'col_offset', 
+ 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('AnnAssign', (( + 'target', + 'annotation', + 'value', + 'simple', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('Print', (( + 'dest', + 'values', + 'nl', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('For', (('target', 'iter', 'body', 'orelse', 'type_comment'), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('AsyncFor', (('target', 'iter', 'body', 'orelse', 'type_comment'), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('While', (( + 'test', + 'body', + 'orelse', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('If', (( + 'test', + 'body', + 'orelse', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('With', (('items', 'body', 'type_comment'), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('AsyncWith', (('items', 'body', 'type_comment'), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('Raise', (( + 'exc', + 'cause', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('Try', (( + 'body', + 'handlers', + 'orelse', + 'finalbody', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('Assert', (( + 'test', + 'msg', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('Import', (('names', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('ImportFrom', (( + 'module', + 'names', + 'level', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('Exec', (( + 'body', + 'globals', + 'locals', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('Global', (('names', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('Nonlocal', (('names', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('Expr', (('value', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('Pass', ((), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('Break', ((), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('Continue', ((), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + + # expr + ('BoolOp', (( + 'op', + 'values', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('BinOp', (( + 'left', + 'op', + 'right', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('UnaryOp', (( + 'op', + 'operand', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('Lambda', (( + 'args', + 'body', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('IfExp', (( + 'test', + 'body', + 'orelse', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('Dict', (( + 'keys', + 'values', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('Set', (('elts', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('ListComp', (( + 'elt', + 'generators', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('SetComp', (( + 'elt', + 
'generators', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('DictComp', (( + 'key', + 'value', + 'generators', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('GeneratorExp', (( + 'elt', + 'generators', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('Await', (('value', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('Yield', (('value', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('YieldFrom', (('value', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('Compare', (( + 'left', + 'ops', + 'comparators', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('Call', (( + 'func', + 'args', + 'keywords', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('Repr', (('value', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('FormattedValue', (( + 'value', + 'conversion', + 'format_spec', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('JoinedStr', (('values', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('Constant', (('value', 'kind'), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('Attribute', (( + 'value', + 'attr', + 'ctx', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('Subscript', (( + 'value', + 'slice', + 'ctx', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('Starred', (( + 'value', + 'ctx', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('Name', (('id', 'ctx', 'annotation', 'type_comment'), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('List', (( + 'elts', + 'ctx', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('Tuple', (( + 'elts', + 'ctx', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + + # expr_context + ('Load', ((), (), (expr_context, ))), + ('Store', ((), (), (expr_context, ))), + ('Del', ((), (), (expr_context, ))), + ('AugLoad', ((), (), (expr_context, ))), + ('AugStore', ((), (), (expr_context, ))), + ('Param', ((), (), (expr_context, ))), + + # slice + ('Slice', (('lower', 'upper', 'step'), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (slice, ))), + + # boolop + ('And', ((), (), (boolop, ))), + ('Or', ((), (), (boolop, ))), + + # operator + ('Add', ((), (), (operator, ))), + ('Sub', ((), (), (operator, ))), + ('Mult', ((), (), (operator, ))), + ('MatMult', ((), (), (operator, ))), + ('Div', ((), (), (operator, ))), + ('Mod', ((), (), (operator, ))), + ('Pow', ((), (), (operator, ))), + ('LShift', ((), (), (operator, ))), + ('RShift', ((), (), (operator, ))), + ('BitOr', ((), (), (operator, ))), + ('BitXor', ((), (), (operator, ))), + ('BitAnd', ((), (), (operator, ))), + ('FloorDiv', ((), (), (operator, ))), + + # unaryop + ('Invert', ((), (), ( + unaryop, + AST, ))), + ('Not', ((), (), ( + unaryop, + AST, ))), + ('UAdd', ((), (), ( + unaryop, + AST, ))), + ('USub', ((), (), ( + unaryop, + AST, ))), + + # cmpop + ('Eq', ((), (), (cmpop, ))), + ('NotEq', ((), (), (cmpop, ))), + ('Lt', ((), (), (cmpop, ))), + ('LtE', ((), (), (cmpop, ))), + ('Gt', ((), (), (cmpop, ))), + 
('GtE', ((), (), (cmpop, ))), + ('Is', ((), (), (cmpop, ))), + ('IsNot', ((), (), (cmpop, ))), + ('In', ((), (), (cmpop, ))), + ('NotIn', ((), (), (cmpop, ))), + + # comprehension + ('comprehension', (('target', 'iter', 'ifs', 'is_async'), (), (AST, ))), + + # excepthandler + ('ExceptHandler', (('type', 'name', 'body'), + ('lineno', 'col_offset', 'end_lineno', + 'end_col_offset'), (excepthandler, ))), + + # arguments + ('arguments', (('args', 'posonlyargs', 'vararg', 'kwonlyargs', + 'kw_defaults', 'kwarg', 'defaults'), (), (AST, ))), + + # keyword + ('keyword', + (('arg', 'value'), + ('lineno', 'col_offset', 'end_lineno', 'end_col_offset'), (AST, ))), + + # alias + ('alias', (('name', 'asname'), (), (AST, ))), + + # withitem + ('withitem', (('context_expr', 'optional_vars'), (), (AST, ))), + + # type_ignore + ('type_ignore', ((), ('lineno', 'tag'), (TypeIgnore, ))), ) + +for name, descr in _nodes: + _make_node(name, *descr) + +py_version = _sys.version_info.major +if py_version != 3: + raise RuntimeError( + 'Required Python version >= 3, but received Python version == {}'. + format(py_version)) + +from .ast3 import ast_to_gast, gast_to_ast + + +def parse(*args, **kwargs): + return ast_to_gast(_ast.parse(*args, **kwargs)) + + +def literal_eval(node_or_string): + if isinstance(node_or_string, AST): + node_or_string = gast_to_ast(node_or_string) + return _ast.literal_eval(node_or_string) + + +def get_docstring(node, clean=True): + if not isinstance(node, (FunctionDef, ClassDef, Module)): + raise TypeError("%r can't have docstrings" % node.__class__.__name__) + if node.body and isinstance(node.body[0], Expr) and \ + isinstance(node.body[0].value, Constant): + if clean: + import inspect + holder = node.body[0].value + return inspect.cleandoc(getattr(holder, holder._fields[0])) + return node.body[0].value.s + + +# the following are directly imported from python3.8's Lib/ast.py # + + +def copy_location(new_node, old_node): + """ + Copy source location (`lineno`, `col_offset`, `end_lineno`, and + `end_col_offset` attributes) from *old_node* to *new_node* if possible, + and return *new_node*. + """ + for attr in 'lineno', 'col_offset', 'end_lineno', 'end_col_offset': + if attr in old_node._attributes and attr in new_node._attributes \ + and hasattr(old_node, attr): + setattr(new_node, attr, getattr(old_node, attr)) + return new_node + + +def fix_missing_locations(node): + """ + When you compile a node tree with compile(), the compiler expects lineno + and col_offset attributes for every node that supports them. This is + rather tedious to fill in for generated nodes, so this helper adds these + attributes recursively where not already set, by setting them to the values + of the parent node. It works recursively starting at *node*. 
+ """ + + def _fix(node, lineno, col_offset, end_lineno, end_col_offset): + if 'lineno' in node._attributes: + if not hasattr(node, 'lineno'): + node.lineno = lineno + else: + lineno = node.lineno + if 'end_lineno' in node._attributes: + if not hasattr(node, 'end_lineno'): + node.end_lineno = end_lineno + else: + end_lineno = node.end_lineno + if 'col_offset' in node._attributes: + if not hasattr(node, 'col_offset'): + node.col_offset = col_offset + else: + col_offset = node.col_offset + if 'end_col_offset' in node._attributes: + if not hasattr(node, 'end_col_offset'): + node.end_col_offset = end_col_offset + else: + end_col_offset = node.end_col_offset + for child in iter_child_nodes(node): + _fix(child, lineno, col_offset, end_lineno, end_col_offset) + + _fix(node, 1, 0, 1, 0) + return node + + +def increment_lineno(node, n=1): + """ + Increment the line number and end line number of each node in the tree + starting at *node* by *n*. This is useful to "move code" to a different + location in a file. + """ + for child in walk(node): + if 'lineno' in child._attributes: + child.lineno = (getattr(child, 'lineno', 0) or 0) + n + if 'end_lineno' in child._attributes: + child.end_lineno = (getattr(child, 'end_lineno', 0) or 0) + n + return node diff --git a/python/requirements.txt b/python/requirements.txt index e9da2aa24d6cb2..4232700761581c 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -2,8 +2,6 @@ requests>=2.20.0 numpy>=1.13 ; python_version>="3.5" and platform_system != "Windows" numpy>=1.13, <=1.19.3 ; python_version>="3.5" and platform_system == "Windows" protobuf>=3.1.0 -gast>=0.3.3, <=0.4.0 ; platform_system != "Windows" -gast==0.3.3 ; platform_system == "Windows" Pillow six decorator diff --git a/python/setup.py.in b/python/setup.py.in index 0db6c0c27d743d..d530f8483bcde7 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -139,6 +139,7 @@ write_distributed_training_mode_py(filename='@PADDLE_BINARY_DIR@/python/paddle/f packages=['paddle', 'paddle.libs', 'paddle.utils', + 'paddle.utils.gast', 'paddle.utils.cpp_extension', 'paddle.dataset', 'paddle.reader', @@ -149,6 +150,7 @@ packages=['paddle', 'paddle.incubate.operators', 'paddle.distributed.fleet', 'paddle.distributed.fleet.base', + 'paddle.distributed.fleet.elastic', 'paddle.distributed.fleet.meta_optimizers', 'paddle.distributed.fleet.meta_optimizers.sharding', 'paddle.distributed.fleet.meta_optimizers.ascend', @@ -393,11 +395,11 @@ def find_files(pattern, root, recursive=False): headers = ( list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle')) + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/extension/include')) + # extension - list(find_files('*', '${BOOST_INCLUDE_DIR}/boost', True)) + # boost # For paddle uew custom op, only copy data type headers from `paddle/fluid/platform` # to `extension/incude`, ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/complex.h'] + - ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/float16.h']) + ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/float16.h'] + + ['@PADDLE_SOURCE_DIR@/paddle/utils/any.h']) if '${WITH_MKLDNN}' == 'ON': headers += list(find_files('*', '${MKLDNN_INSTALL_DIR}/include')) # mkldnn @@ -444,12 +446,12 @@ class InstallHeaders(Command): elif 'third_party' not in header: # paddle headers install_dir = re.sub('@PADDLE_SOURCE_DIR@/', '', header) - if 'fluid' in install_dir: + if 'fluid' in install_dir or 'utils' in install_dir: install_dir = "paddle/extension/include/" else: # third_party install_dir = re.sub('${THIRD_PARTY_PATH}', 'third_party', 
header) - patterns = ['boost/src/extern_boost', 'install/mkldnn/include'] + patterns = ['install/mkldnn/include'] for pattern in patterns: install_dir = re.sub(pattern, '', install_dir) install_dir = os.path.join(self.install_dir, os.path.dirname(install_dir))
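
The hunks above change user-visible behavior in three places: `paddle.sum` now promotes bool/int32/int64 inputs to int64 when no dtype is given, the RNN cells reject a non-positive `hidden_size`, and a vendored `gast` package is exposed as `paddle.utils.gast` (replacing the external `gast` requirement removed from requirements.txt). A minimal smoke-test sketch of that behavior, assuming a Paddle build that already contains this patch; the module path, the promotion rule, and the ValueError message are read off the diff itself, not from released documentation:

    import paddle
    from paddle.utils import gast   # vendored by this patch; not the external pip package


    def check_sum_promotion():
        # bool/int32/int64 inputs are summed in int64 when dtype is not specified.
        x = paddle.ones([2, 3], dtype='int32')
        print(paddle.sum(x).dtype)  # expected: paddle.int64 with this patch applied


    def check_rnn_validation():
        # SimpleRNNCell/LSTMCell/GRUCell now raise a ValueError instead of
        # computing 1/sqrt(hidden_size) on a non-positive hidden_size.
        try:
            paddle.nn.SimpleRNNCell(input_size=16, hidden_size=0)
        except ValueError as err:
            print('rejected:', err)


    def check_vendored_gast():
        # paddle.utils.gast mirrors the gast API on top of the standard ast module.
        tree = gast.parse("def f(x):\n    return x + 1\n")
        print(type(tree).__name__)                # Module
        print(sum(1 for _ in gast.walk(tree)))    # node count of the unified AST


    if __name__ == '__main__':
        check_sum_promotion()
        check_rnn_validation()
        check_vendored_gast()

Because the vendored node classes subclass the standard ast categories (mod, stmt, expr, ...), existing NodeVisitor/NodeTransformer passes should keep working; only the import path moves from the external gast wheel to paddle.utils.gast.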